parallel101 · xu-kq · Jan 11, 2022 · Jan 11, 2022 · Jan 12, 2022 · Jan 12, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,3 +7,7 @@ if (NOT CMAKE_BUILD_TYPE)
 endif()
 
 add_executable(main main.cpp)
+
+find_package(OpenMP REQUIRED)
+target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
+target_compile_options(main PUBLIC -ffast-math -march=native)
diff --git a/main.cpp b/main.cpp
@@ -12,11 +12,13 @@ struct Star {
     float px, py, pz;
     float vx, vy, vz;
     float mass;
+    char padding[12];
 };
 
 std::vector<Star> stars;
 
 void init() {
+    #pragma unroll
     for (int i = 0; i < 48; i++) {
         stars.push_back({
             frand(), frand(), frand(),
@@ -26,23 +28,32 @@ void init() {
     }
 }
 
-float G = 0.001;
-float eps = 0.001;
-float dt = 0.01;
+float G = 0.001f;
+float eps = 0.001f;
+float dt = 0.01f;
 
 void step() {
+    float eps2 = eps * eps;
     for (auto &star: stars) {
+        float tmp_vx = 0, tmp_vy = 0, tmp_vz = 0;
+        float factor = G * dt;
+        #pragma omp simd
         for (auto &other: stars) {
             float dx = other.px - star.px;
             float dy = other.py - star.py;
             float dz = other.pz - star.pz;
-            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
-            d2 *= sqrt(d2);
-            star.vx += dx * other.mass * G * dt / d2;
-            star.vy += dy * other.mass * G * dt / d2;
-            star.vz += dz * other.mass * G * dt / d2;
+            float d2 = dx * dx + dy * dy + dz * dz + eps2;
+            d2 *= std::sqrt(d2);
+            float div = 1.0f / d2;
+            tmp_vx += dx * other.mass * factor * div;
+            tmp_vy += dy * other.mass * factor * div;
+            tmp_vz += dz * other.mass * factor * div;
         }
+        star.vx += tmp_vx;
+        star.vy += tmp_vy;
+        star.vz += tmp_vz;
     }
+    #pragma unroll
     for (auto &star: stars) {
         star.px += star.vx * dt;
         star.py += star.vy * dt;
@@ -52,15 +63,17 @@ void step() {
 
 float calc() {
     float energy = 0;
+    float eps2 = eps * eps;
     for (auto &star: stars) {
         float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
         energy += star.mass * v2 / 2;
+        #pragma omp simd
         for (auto &other: stars) {
             float dx = other.px - star.px;
             float dy = other.py - star.py;
             float dz = other.pz - star.pz;
-            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
-            energy -= other.mass * star.mass * G / sqrt(d2) / 2;
+            float d2 = dx * dx + dy * dy + dz * dz + eps2;
+            energy -= other.mass * star.mass * G / std::sqrt(d2) / 2;
         }
     }
     return energy;
@@ -79,6 +92,7 @@ int main() {
     init();
     printf("Initial energy: %f\n", calc());
     auto dt = benchmark([&] {
+    #pragma unroll
         for (int i = 0; i < 100000; i++)
             step();
     });