Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed performance optimized by 30 times #39

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"files.associations": {
"chrono": "cpp",
"cmath": "cpp"
}
}
10 changes: 9 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@ cmake_minimum_required(VERSION 3.12)
project(hellocmake LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")

# target_compile_options(testbench PUBLIC -ffast-math -march=native)
# find_package(OpenMP REQUIRED)
# target_link_libraries(testbench PUBLIC OpenMP::OpenMP_CXX)

if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

add_executable(main main.cpp)
add_executable(main main.cpp)
44 changes: 44 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,47 @@
- 不允许做算法复杂度优化
- 可以针对编译器和平台优化,这次不要求跨平台
- 可以用 xmmintrin.h,如果你觉得编译器靠不住的话

初始数据:

Initial energy: -8.571526
Final energy: -8.511777
Time elapsed: 6646 ms

编译选项加入 -O3 优化:

Initial energy: -8.571526
Final energy: -8.511777
Time elapsed: 1737 ms

将结构体数组(AoS)改成多个独立数组(SoA,即面向数据的内存布局):

Initial energy: -8.571526
Final energy: -8.511777
Time elapsed: 1734 ms

加入 GCC 循环优化 pragma:

```
#pragma GCC ivdep
#pragma GCC unroll 4
```

Initial energy: -8.571302
Final energy: -8.511518
Time elapsed: 1587 ms

加上“暴力火车头”(在 main.cpp 开头堆叠大量 `#pragma GCC optimize` 优化指令):

Initial energy: -8.571527
Final energy: -8.511723
Time elapsed: 1175 ms


加入编译选项:

`-ffast-math -march=native`

Initial energy: -8.571527
Final energy: -8.511747
Time elapsed: 210 ms
88 changes: 88 additions & 0 deletions initial.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>

/// Returns a pseudo-random float in the closed interval [-1, 1].
///
/// One value is drawn from the C library generator `rand()`, scaled by
/// `RAND_MAX` into [0, 1], then stretched and shifted onto [-1, 1].
float frand() {
    const float unit = static_cast<float>(rand()) / RAND_MAX;
    return unit * 2 - 1;
}

/// One simulated body: position, velocity and mass, all single precision.
///
/// The member order is significant: init() fills a Star through positional
/// aggregate initialization (px, py, pz, vx, vy, vz, mass).
struct Star {
    float px;    // position, x component
    float py;    // position, y component
    float pz;    // position, z component
    float vx;    // velocity, x component
    float vy;    // velocity, y component
    float vz;    // velocity, z component
    float mass;  // mass of the body
};

// Global list of every simulated body: filled by init(), advanced in place by
// step(), and read by calc().
std::vector<Star> stars;

/// Appends 48 randomly generated bodies to the global `stars` list.
///
/// For each body the three position components and then the three velocity
/// components are drawn from frand() (in that order); the mass is
/// frand() + 1.
void init() {
    int remaining = 48;
    while (remaining-- > 0) {
        // Draw the values one statement at a time so the order of frand()
        // calls is explicit: position, then velocity, then mass.
        const float pos_x = frand();
        const float pos_y = frand();
        const float pos_z = frand();
        const float vel_x = frand();
        const float vel_y = frand();
        const float vel_z = frand();
        const float body_mass = frand() + 1;
        stars.push_back({pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, body_mass});
    }
}

// Simulation parameters shared by step() and calc().
float G{0.001};    // gravitational constant scaling every pairwise interaction
float eps{0.001};  // softening length: eps * eps is added to each squared distance
float dt{0.01};    // time step applied by step() to both velocity and position updates

void step() {
for (auto &star: stars) {
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
d2 *= sqrt(d2);
star.vx += dx * other.mass * G * dt / d2;
star.vy += dy * other.mass * G * dt / d2;
star.vz += dz * other.mass * G * dt / d2;
}
}
for (auto &star: stars) {
star.px += star.vx * dt;
star.py += star.vy * dt;
star.pz += star.vz * dt;
}
}

float calc() {
float energy = 0;
for (auto &star: stars) {
float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
energy += star.mass * v2 / 2;
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
energy -= other.mass * star.mass * G / sqrt(d2) / 2;
}
}
return energy;
}

/// Invokes `func` exactly once and returns how long the call took, in whole
/// milliseconds, measured on std::chrono::steady_clock.
///
/// @param func  Callable taking no arguments; its return value is ignored.
/// @return      Elapsed time truncated to milliseconds.
template <class Func>
long benchmark(Func const &func) {
    using Clock = std::chrono::steady_clock;
    const Clock::time_point started = Clock::now();
    func();
    const Clock::time_point finished = Clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(finished - started).count();
}

/// Program entry point: builds the system, prints its energy, times 100000
/// simulation steps, then prints the final energy and the elapsed time.
int main() {
    init();
    printf("Initial energy: %f\n", calc());

    // Only the stepping loop is timed; init() and both calc() calls are not.
    const long elapsed_ms = benchmark([] {
        int remaining = 100000;
        while (remaining-- > 0) {
            step();
        }
    });

    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", elapsed_ms);
    return 0;
}
Binary file added initial.exe
Binary file not shown.
127 changes: 89 additions & 38 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,64 +3,115 @@
#include <vector>
#include <chrono>
#include <cmath>
#include <x86intrin.h>
#include <mmintrin.h> //mmx
#include <xmmintrin.h> //sse
#include <emmintrin.h> //sse2
#include <pmmintrin.h> //sse3

// File-scope GCC pragmas: select the AVX target and request a long list of
// optimization options from inside the source file instead of on the
// compiler command line.
// NOTE(review): several requests appear twice under two spellings
// ("unroll-loops" / "-funroll-loops", "inline-small-functions" /
// "-finline-small-functions"), and optimize(3) is immediately followed by
// optimize("Ofast"). Presumably many of the remaining entries are already
// implied by those levels — confirm against the GCC documentation before
// trimming the list.
#pragma GCC target("avx")
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-fhoist-adjacent-loads")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")


/// Returns a pseudo-random float in the closed interval [-1, 1].
///
/// One value is drawn from the C library generator `rand()`, scaled by
/// `RAND_MAX` into [0, 1], then stretched and shifted onto [-1, 1].
float frand() {
    const float unit = static_cast<float>(rand()) / RAND_MAX;
    return unit * 2 - 1;
}

struct Star {
float px, py, pz;
float vx, vy, vz;
float mass;
};

std::vector<Star> stars;
// Structure-of-arrays storage for the 48 simulated bodies: one array per
// position component, per velocity component, and one for the masses.
// Each array is 16-byte aligned. The standard `alignas` specifier is used
// instead of `__declspec(align(16))`, which is a Microsoft extension and is
// not accepted by GCC on non-Windows targets — while the rest of this file
// relies on GCC-only pragmas.
alignas(16) float px[48], py[48], pz[48];
alignas(16) float vx[48], vy[48], vz[48];
alignas(16) float mass[48];

void init() {
for (int i = 0; i < 48; i++) {
stars.push_back({
frand(), frand(), frand(),
frand(), frand(), frand(),
frand() + 1,
});
for (uint32_t i = 0; i < 48; i++) {
px[i] = frand();py[i]=frand();pz[i] = frand();
vx[i] = frand();vy[i]=frand();vz[i]=frand();
mass[i] = frand()+1;
}
}

float G = 0.001;
float eps = 0.001;
float dt = 0.01;
constexpr float G = 0.001;
constexpr float eps = 0.001;
constexpr float dt = 0.01;

void step() {
for (auto &star: stars) {
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
float dx,dy,dz,d2;
for (size_t i=0;i<(uint32_t)48;++i) {
for (size_t j=0;j<(uint32_t)48;++j) {
// #pragma omp simd
dx = px[j] - px[i];
dy = py[j] - py[i];
dz = pz[j] - pz[i];
d2 = dx * dx + dy * dy + dz * dz + (eps * eps);
d2 *= sqrt(d2);
star.vx += dx * other.mass * G * dt / d2;
star.vy += dy * other.mass * G * dt / d2;
star.vz += dz * other.mass * G * dt / d2;
d2 = mass[j] * G * dt / d2;
vx[i] += dx * d2;
vy[i] += dy * d2;
vz[i] += dz * d2;
}
}
for (auto &star: stars) {
star.px += star.vx * dt;
star.py += star.vy * dt;
star.pz += star.vz * dt;
for(size_t i=0;i<48; ++i){
// #pragma omp simd
px[i] += vx[i] * dt;
py[i] += vy[i] * dt;
pz[i] += vz[i] * dt;
}
}

float calc() {
float dx,dy,dz,d2;
float energy = 0;
for (auto &star: stars) {
float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
energy += star.mass * v2 / 2;
for (auto &other: stars) {
float dx = other.px - star.px;
float dy = other.py - star.py;
float dz = other.pz - star.pz;
float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
energy -= other.mass * star.mass * G / sqrt(d2) / 2;
for (size_t i=0;i<48;++i) {
// #pragma omp simd
float v2 = vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i];
energy += mass[i] * v2 * 0.5;
for (size_t j=0;j<48;++j) {
// #pragma omp simd
dx = px[j] - px[i];
dy = py[j] - py[i];
dz = pz[j] - pz[i];
d2 = (dx * dx + dy * dy + dz * dz + (eps * eps));
energy -= mass[j] * mass[i] * 0.0005 / sqrt(d2);
}
}
return energy;
Expand Down
Binary file added main.exe
Binary file not shown.
6 changes: 6 additions & 0 deletions rbq.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
rem Build main.exe with aggressive optimization, then run it five times so the
rem reported timings can be compared across runs.
rem The earlier -O2 was removed: a later -O3 on the same command line overrides it.
rem Stop if the build fails, so a stale main.exe is never benchmarked by mistake.
g++ -o main.exe main.cpp -ffast-math -march=native -msse4.1 -fopenmp -O3 -std=c++17 || exit /b 1
main.exe
main.exe
main.exe
main.exe
main.exe