Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

麻烦小彭老师优化一下下 #4

Open
balleb6545anickk opened this issue Aug 26, 2023 · 8 comments
Open

麻烦小彭老师优化一下下 #4

balleb6545anickk opened this issue Aug 26, 2023 · 8 comments

Comments

@balleb6545anickk
Copy link

balleb6545anickk commented Aug 26, 2023

// 以下是项目中的一段热点代码
// 会调用这个函数很多次,用来计算亚像素的像素值,方法是插值(具体算法可以不用管),里头的具体魔数我改了一下,因为是公司的代码,而且跟优化没有关系

double GetSubPixelValue(const double* preCaculatedParameter, int width, int height, double X, double Y)
{
	int xIndex[6], yIndex[6];
	xIndex[0] = int(X + 0.5) - 2;
	yIndex[0] = int(Y + 0.5) - 2;
	for (int i = 1; i < 5; i++)
	{
		xIndex[i] = xIndex[i - 1] + 1;
		yIndex[i] = yIndex[i - 1] + 1;
	}

	double xWeight[5];
	{
		double w = X - (double)xIndex[2];
		double w2 = w * w;
		double w3 = w * w2;
		double w4 = w2 * w2;
		xWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
		xWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
		xWeight[2] = 0.15 *w4 - 0.222 * w2 + 0.0777;
		xWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
		xWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
	}

	// 下面这个计算方法和上面是一样的
	double yWeight[5];
	{
		double w = Y - (double)yIndex[2];
		double w2 = w * w;
		double w3 = w * w2;
		double w4 = w2 * w2;
		yWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
		yWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
		yWeight[2] = 0.15 *w4 - 0.222 * w2 + 0.0777;
		yWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
		yWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
	}

	int width2 = 2 * width - 2;
	int height2 = 2 * height - 2;
	for (int i = 0; i < 5; i++)
	{
		xIndex[i] = (xIndex[i] < 0) ? (-xIndex[i] - width2 * ((-xIndex[i]) / width2)) : (xIndex[i] - width2 * (xIndex[i] / width2));
		if (width <= xIndex[i])
			xIndex[i] = width2 - xIndex[i];

		yIndex[i] = (yIndex[i] < 0) ? (-yIndex[i] - height2 * ((-yIndex[i]) / height2)) : (yIndex[i] - height2 * (yIndex[i] / height2));
		if (height <= yIndex[i])
			yIndex[i] = height2 - yIndex[i];
	}

	double result = 0;
	for (int i = 0; i < 5; i++)
	{
		for (int j = 0; j < 5; j++)
		{
			result += ((*(preCaculatedParameter + width * yIndex[i] + xIndex[j])) * xWeight[j] * yWeight[i]);
		}
	}
	return result;
}

#include <vector>
#include <chrono>
#include <iostream>

#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"秒\n";


#define TEST_NUM 10000000
int main() {
	int width = 2048;
	int height = 2048;
	std::vector<double> preCaculatedParameter(width * height, 0.123); // 里面是预计算的数字
	std::vector<double> xs(TEST_NUM);
	std::vector<double> ys(TEST_NUM);
	for (size_t i = 0; i < xs.size(); i++)
	{
		xs[i] = rand() % 2048;
		ys[i] = rand() % 2048;
	}
	std::vector<double> result(1);

	TICK(t);
	for (int i = 0; i < TEST_NUM; i++)
	{
		result[0] = GetSubPixelValue(preCaculatedParameter.data(), width, height, xs[i], ys[i]);
	}
	TOCK(t);

	return 0;
}

@archibate
Copy link
Contributor

我可以用float吗?

@archibate
Copy link
Contributor

这是一个membound内核,瓶颈在于无序访存,只需对xys排序,就可以从0.89秒提升到0.42秒。

@archibate
Copy link
Contributor

#include <cmath>
#include <immintrin.h>

#define WIDTH 4

// 以下是项目中的一段热点代码
// 会调用这个函数很多次,用来计算亚像素的像素值,方法是插值(具体算法可以不用管),里头的具体魔数我改了一下,因为是公司的代码,而且跟优化没有关系

void GetSubPixelValue(double *__restrict Rs, const double*__restrict preCaculatedParameter, int width, int height, double *__restrict Xs, double *__restrict Ys)
{
    auto c0_5 = _mm256_set1_pd(0.5);
    auto X = _mm256_loadu_pd(Xs);
    auto Y = _mm256_loadu_pd(Ys);
    auto xIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(X, c0_5));
    auto yIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(Y, c0_5));
	/* int xIndex[6], yIndex[6]; */
	/* xIndex[0] = int(X + 0.5) - 2; */
	/* yIndex[0] = int(Y + 0.5) - 2; */
	/* for (int i = 1; i < 5; i++) */
	/* { */
	/* 	xIndex[i] = xIndex[i - 1] + 1; */
	/* 	yIndex[i] = yIndex[i - 1] + 1; */
	/* } */

    auto c0_111 = _mm256_set1_pd(0.111);
    auto c0_222 = _mm256_set1_pd(0.222);
    auto c0_233 = _mm256_set1_pd(0.233);
    auto c0_344 = _mm256_set1_pd(0.344);
    auto c0_15 = _mm256_set1_pd(0.15);
    auto c0_333 = _mm256_set1_pd(0.333);
    auto c0_444 = _mm256_set1_pd(0.444);
    auto c0_555 = _mm256_set1_pd(0.555);
    auto c0_0666 = _mm256_set1_pd(0.0666);
    auto c0_0777 = _mm256_set1_pd(0.0777);
    auto c0_345 = _mm256_set1_pd(0.345);
    auto c1o22 = _mm256_set1_pd(1.0 / 22.0);

	__m256d xWeight[5];
    __m256d yWeight[5];
    auto xw = _mm256_sub_pd(X, _mm256_cvtepi32_pd(xIndex2));
    auto xw2 = _mm256_mul_pd(xw, xw);
    auto xw3 = _mm256_mul_pd(xw, xw2);
    auto xw4 = _mm256_mul_pd(xw2, xw2);
    auto yw = _mm256_sub_pd(X, _mm256_cvtepi32_pd(yIndex2));
    auto yw2 = _mm256_mul_pd(yw, yw);
    auto yw3 = _mm256_mul_pd(yw, yw2);
    auto yw4 = _mm256_mul_pd(yw2, yw2);
    xWeight[0] = _mm256_fmsub_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmsub_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[0] = _mm256_fmsub_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmsub_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));
    xWeight[1] = _mm256_fmadd_pd(c0_333, xw3, _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_555, xw, _mm256_fmadd_pd(c0_333, xw4, c0_0666))));
    yWeight[1] = _mm256_fmadd_pd(c0_333, yw3, _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_555, yw, _mm256_fmadd_pd(c0_333, yw4, c0_0666))));
    xWeight[2] = _mm256_fmsub_pd(c0_15, xw4, _mm256_fmadd_pd(c0_222, xw2, c0_0777));
    yWeight[2] = _mm256_fmsub_pd(c0_15, yw4, _mm256_fmadd_pd(c0_222, yw2, c0_0777));
    xWeight[3] = _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_333, xw4, _mm256_fmadd_pd(c0_333, xw3, _mm256_fmadd_pd(c0_555, xw, c0_0666))));
    yWeight[3] = _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_333, yw4, _mm256_fmadd_pd(c0_333, yw3, _mm256_fmadd_pd(c0_555, yw, c0_0666))));
    xWeight[4] = _mm256_fmadd_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmadd_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[4] = _mm256_fmadd_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmadd_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));

#ifndef USE_BIT
    int width2 = 2 * width - 2;
    int height2 = 2 * height - 2;
    auto v_width = _mm_set1_epi32(width);
    auto v_height = _mm_set1_epi32(height);
    auto v_width2 = _mm256_set1_pd((double)width2);
    auto v_height2 = _mm256_set1_pd((double)height2);
    auto v_width2i = _mm_set1_epi32(width2);
    auto v_height2i = _mm_set1_epi32(height2);
#endif
	__m128i xIndex[5];
	__m128i yIndex[5];
    auto vi = _mm_set1_epi64x(-2);
    auto c1 = _mm_set1_epi64x(1);
#ifdef USE_BIT
    auto wmask = _mm_set1_epi32(2047);
    auto hmask = _mm_set1_epi32(2047);
#endif
	for (int i = 0; i < 5; i++)
	{
        auto xIndexI = _mm_add_epi32(xIndex2, vi);
        vi = _mm_add_epi32(vi, c1);
      #ifdef USE_BIT
        xIndexI = _mm_and_si128(xIndexI, wmask);
      #else
		xIndexI = _mm_abs_epi32(xIndexI);
		xIndexI = _mm_sub_epi32(xIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_width2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(xIndexI), v_width2)))));
		        xIndexI = _mm_blendv_epi8(xIndexI, _mm_sub_epi32(v_width2i, xIndexI), _mm_or_si128(_mm_cmpgt_epi32(xIndexI, v_width), _mm_cmpeq_epi32(xIndexI, v_width)));
      #endif
        xIndex[i] = xIndexI;
        auto yIndexI = yIndex2;
      #ifdef USE_BIT
        yIndexI = _mm_and_si128(xIndexI, hmask);
        yIndexI = _mm_slli_epi32(yIndexI, 11);
      #else
		yIndexI = _mm_abs_epi32(yIndexI);
		yIndexI = _mm_sub_epi32(yIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_height2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(yIndexI), v_height2)))));
		        yIndexI = _mm_blendv_epi8(yIndexI, _mm_sub_epi32(v_height2i, yIndexI), _mm_or_si128(_mm_cmpgt_epi32(yIndexI, v_height), _mm_cmpeq_epi32(yIndexI, v_height)));
        yIndexI = _mm_mullo_epi32(yIndexI, v_width);
      #endif
        yIndex[i] = yIndexI;
	}

    auto result = _mm256_setzero_pd();
	for (int i = 0; i < 5; i++)
	{
        auto rowResult = _mm256_setzero_pd();
        auto yIndexI = yIndex[i];
      #pragma GCC unroll 5
		for (int j = 0; j < 5; j++)
		{
            auto factor = _mm256_i32gather_pd(preCaculatedParameter, _mm_add_epi32(yIndexI, xIndex[j]), 1);
            rowResult = _mm256_fmadd_pd(rowResult, factor, xWeight[j]);
		}
        result = _mm256_fmadd_pd(result, rowResult, yWeight[i]);
	}
    _mm256_storeu_pd(Rs, result);
}

#include <vector>
#include <chrono>
#include <iostream>

#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"\n";


#define TEST_NUM 10000000
int main() {
	int width = 2048;
	int height = 2048;
	std::vector<double> preCaculatedParameter(width * height, 0.123); // 里面是预计算的数字
	std::vector<double> xs(TEST_NUM);
	std::vector<double> ys(TEST_NUM);
	for (size_t i = 0; i < xs.size(); i++)
	{
		xs[i] = rand() % 2048;
		ys[i] = rand() % 2048;
	}
	std::vector<double> result(TEST_NUM);

	TICK(t);
    /* #pragma omp parallel for */
	for (size_t i = 0; i < TEST_NUM / WIDTH * WIDTH; i += WIDTH)
	{
		GetSubPixelValue(result.data() + i, preCaculatedParameter.data(), width, height, xs.data() + i, ys.data() + i);
	}
#ifdef __GNUC__
        asm volatile ("" ::: "cc", "memory");
#endif
	TOCK(t);

	return 0;
}

优化前:1.01623秒
优化后:0.104965秒
优化后+开启并行:0.0238891秒

@archibate
Copy link
Contributor

使用i32gather优化了一下随机访存,如果能够改用float还可以进一步提升。

@balleb6545anickk
Copy link
Author

小彭老师,你上面的代码有点问题,我改了一下,加速比会变小。
我在台式机i9-13900k上测试,差不多是0.62s->0.4s。
代码我都看完了,思路是对的,我之前的思路应该是错的。
我觉得可能还可以再优化一下。

#include <cmath>
#include <immintrin.h>

#define WIDTH 4

// 以下是项目中的一段热点代码
// 会调用这个函数很多次,用来计算亚像素的像素值,方法是插值(具体算法可以不用管),里头的具体魔数我改了一下,因为是公司的代码,而且跟优化没有关系

void GetSubPixelValue(double* __restrict Rs, const double* __restrict preCaculatedParameter, int width, int height, double* __restrict Xs, double* __restrict Ys)
{
    auto c0_5 = _mm256_set1_pd(0.5);
    auto X = _mm256_loadu_pd(Xs);
    auto Y = _mm256_loadu_pd(Ys);
    auto xIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(X, c0_5));
    auto yIndex2 = _mm256_cvttpd_epi32(_mm256_add_pd(Y, c0_5));
    /* int xIndex[6], yIndex[6]; */
    /* xIndex[0] = int(X + 0.5) - 2; */
    /* yIndex[0] = int(Y + 0.5) - 2; */
    /* for (int i = 1; i < 5; i++) */
    /* { */
    /* 	xIndex[i] = xIndex[i - 1] + 1; */
    /* 	yIndex[i] = yIndex[i - 1] + 1; */
    /* } */

    auto c0_111 = _mm256_set1_pd(0.111);
    auto c0_222 = _mm256_set1_pd(-0.222);
    auto c0_233 = _mm256_set1_pd(0.233);
    auto c0_344 = _mm256_set1_pd(0.344);
    auto c0_15 = _mm256_set1_pd(0.15);
    auto c0_333 = _mm256_set1_pd(0.333);
    auto c0_444 = _mm256_set1_pd(0.444);
    auto c0_555 = _mm256_set1_pd(0.555);
    auto c0_0666 = _mm256_set1_pd(0.0666);
    auto c0_0777 = _mm256_set1_pd(0.0777);
    auto c0_345 = _mm256_set1_pd(0.345);
    auto c1o22 = _mm256_set1_pd(1.0 / 22.0);

    __m256d xWeight[5];
    __m256d yWeight[5];
    auto xw = _mm256_sub_pd(X, _mm256_cvtepi32_pd(xIndex2));
    auto xw2 = _mm256_mul_pd(xw, xw);
    auto xw3 = _mm256_mul_pd(xw, xw2);
    auto xw4 = _mm256_mul_pd(xw2, xw2);
    auto yw = _mm256_sub_pd(Y, _mm256_cvtepi32_pd(yIndex2));
    auto yw2 = _mm256_mul_pd(yw, yw);
    auto yw3 = _mm256_mul_pd(yw, yw2);
    auto yw4 = _mm256_mul_pd(yw2, yw2);
    xWeight[0] = _mm256_fmsub_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmsub_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[0] = _mm256_fmsub_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmsub_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));
    xWeight[1] = _mm256_fmadd_pd(c0_333, xw3, _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_555, xw, _mm256_fmadd_pd(c0_333, xw4, c0_0666))));
    yWeight[1] = _mm256_fmadd_pd(c0_333, yw3, _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_555, yw, _mm256_fmadd_pd(c0_333, yw4, c0_0666))));
    xWeight[2] = _mm256_fmadd_pd(c0_15, xw4, _mm256_fmadd_pd(c0_222, xw2, c0_0777));
    yWeight[2] = _mm256_fmadd_pd(c0_15, yw4, _mm256_fmadd_pd(c0_222, yw2, c0_0777));
    xWeight[3] = _mm256_fmsub_pd(c0_444, xw2, _mm256_fmsub_pd(c0_333, xw4, _mm256_fmadd_pd(c0_333, xw3, _mm256_fmadd_pd(c0_555, xw, c0_0666))));
    yWeight[3] = _mm256_fmsub_pd(c0_444, yw2, _mm256_fmsub_pd(c0_333, yw4, _mm256_fmadd_pd(c0_333, yw3, _mm256_fmadd_pd(c0_555, yw, c0_0666))));
    xWeight[4] = _mm256_fmadd_pd(c0_111, xw4, _mm256_fmadd_pd(xw3, c1o22, _mm256_fmadd_pd(c0_233, xw2, _mm256_fmadd_pd(c0_344, xw, c0_345))));
    yWeight[4] = _mm256_fmadd_pd(c0_111, yw4, _mm256_fmadd_pd(yw3, c1o22, _mm256_fmadd_pd(c0_233, yw2, _mm256_fmadd_pd(c0_344, yw, c0_345))));
    
    int width2 = 2 * width - 2;
    int height2 = 2 * height - 2;
    auto v_width = _mm_set1_epi32(width);
    auto v_height = _mm_set1_epi32(height);
    auto v_width2 = _mm256_set1_pd((double)width2);
    auto v_height2 = _mm256_set1_pd((double)height2);
    auto v_width2i = _mm_set1_epi32(width2);
    auto v_height2i = _mm_set1_epi32(height2);
    __m128i xIndex[5];
    __m128i yIndex[5];
// 改了下面这两句会影响速度
    auto vi = _mm_set1_epi32(-2); 
    auto c1 = _mm_set1_epi32(1);

    for (int i = 0; i < 5; i++)
    {
        auto xIndexI = _mm_add_epi32(xIndex2, vi);
        xIndexI = _mm_abs_epi32(xIndexI);
        xIndexI = _mm_sub_epi32(xIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_width2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(xIndexI), v_width2)))));
        xIndexI = _mm_blendv_epi8(xIndexI, _mm_sub_epi32(v_width2i, xIndexI), _mm_or_si128(_mm_cmpgt_epi32(xIndexI, v_width), _mm_cmpeq_epi32(xIndexI, v_width)));
        xIndex[i] = xIndexI;
        auto yIndexI = _mm_add_epi32(yIndex2, vi);
        yIndexI = _mm_abs_epi32(yIndexI);
        yIndexI = _mm_sub_epi32(yIndexI, _mm256_cvttpd_epi32(_mm256_mul_pd(v_height2, _mm256_floor_pd(_mm256_div_pd(_mm256_cvtepi32_pd(yIndexI), v_height2)))));
        yIndexI = _mm_blendv_epi8(yIndexI, _mm_sub_epi32(v_height2i, yIndexI), _mm_or_si128(_mm_cmpgt_epi32(yIndexI, v_height), _mm_cmpeq_epi32(yIndexI, v_height)));
        yIndexI = _mm_mullo_epi32(yIndexI, v_width);
        yIndex[i] = yIndexI;
        vi = _mm_add_epi32(vi, c1);
    }

    auto result = _mm256_setzero_pd();
    for (int i = 0; i < 5; i++)
    {
        auto rowResult = _mm256_setzero_pd();
        auto yIndexI = yIndex[i];
#pragma GCC unroll 5
        for (int j = 0; j < 5; j++)
        {
            auto asdfafsd = _mm_add_epi32(yIndexI, xIndex[j]);
            auto factor = _mm256_i32gather_pd(preCaculatedParameter, _mm_add_epi32(yIndexI, xIndex[j]), 8); // 这个是8不是1
            rowResult = _mm256_fmadd_pd(factor, xWeight[j], rowResult);
        }
        result = _mm256_fmadd_pd(rowResult, yWeight[i], result);
    }
    _mm256_storeu_pd(Rs, result);
}

double GetSubPixelValueOrigin(const double* preCaculatedParameter, int width, int height, double X, double Y)
{
    int xIndex[6], yIndex[6];
    xIndex[0] = int(X + 0.5) - 2;
    yIndex[0] = int(Y + 0.5) - 2;
    for (int i = 1; i < 5; i++)
    {
        xIndex[i] = xIndex[i - 1] + 1;
        yIndex[i] = yIndex[i - 1] + 1;
    }

    double xWeight[5];
    {
        double w = X - (double)xIndex[2];
        double w2 = w * w;
        double w3 = w * w2;
        double w4 = w2 * w2;
        xWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
        xWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
        xWeight[2] = 0.15 * w4 - 0.222 * w2 + 0.0777;
        xWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
        xWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
    }

    // 下面这个计算方法和上面是一样的
    double yWeight[5];
    {
        double w = Y - (double)yIndex[2];
        double w2 = w * w;
        double w3 = w * w2;
        double w4 = w2 * w2;
        yWeight[0] = 0.111 * w4 - w3 / 22.0 + 0.233 * w2 - 0.344 * w + 0.345;
        yWeight[1] = -0.333 * w4 + 0.333 * w3 + 0.444 * w2 - 0.555 * w + 0.0666;
        yWeight[2] = 0.15 * w4 - 0.222 * w2 + 0.0777;
        yWeight[3] = -0.333 * w4 - 0.333 * w3 + 0.444 * w2 + 0.555 * w + 0.0666;
        yWeight[4] = 0.111 * w4 + w3 / 22.0 + 0.233 * w2 + 0.344 * w + 0.345;
    }

    int width2 = 2 * width - 2;
    int height2 = 2 * height - 2;
    for (int i = 0; i < 5; i++)
    {
        xIndex[i] = (xIndex[i] < 0) ? (-xIndex[i] - width2 * ((-xIndex[i]) / width2)) : (xIndex[i] - width2 * (xIndex[i] / width2));
        if (width <= xIndex[i])
            xIndex[i] = width2 - xIndex[i];

        yIndex[i] = (yIndex[i] < 0) ? (-yIndex[i] - height2 * ((-yIndex[i]) / height2)) : (yIndex[i] - height2 * (yIndex[i] / height2));
        if (height <= yIndex[i])
            yIndex[i] = height2 - yIndex[i];
    }

    double result = 0;
    for (int i = 0; i < 5; i++)
    {
        for (int j = 0; j < 5; j++)
        {
            result += ((*(preCaculatedParameter + width * yIndex[i] + xIndex[j])) * xWeight[j] * yWeight[i]);
        }
    }
    return result;
}

#include <vector>
#include <chrono>
#include <iostream>

#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) std::cerr<<#x ": "<<std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()-bench_##x).count();std::cerr<<"秒\n";

double fRand(double fMin, double fMax)
{
    double f = (double)rand() / RAND_MAX;
    return fMin + f * (fMax - fMin);
}

#define TEST_NUM 10000000
int main() {
    int width = 2048;
    int height = 2048;
    std::vector<double> preCaculatedParameter(width * height, 0.123); // 里面是预计算的数字
    for (size_t i = 0; i < preCaculatedParameter.size(); i++)
    {
        preCaculatedParameter[i] = fRand(0, 1);
    }

    std::vector<double> xs(TEST_NUM);
    std::vector<double> ys(TEST_NUM);
    for (size_t i = 0; i < xs.size(); i++)
    {
        xs[i] = rand() % 2048;
        ys[i] = rand() % 2048;
    }
    std::vector<double> result(TEST_NUM);

    TICK(t);
    /* #pragma omp parallel for */
    for (size_t i = 0; i < TEST_NUM / WIDTH * WIDTH; i += WIDTH)
    {
        GetSubPixelValue(result.data() + i, preCaculatedParameter.data(), width, height, xs.data() + i, ys.data() + i);
    }
#ifdef __GNUC__
    asm volatile ("" ::: "cc", "memory");
#endif
    TOCK(t);

   // 以下的测试是单独测试的,不会一起测试,也就是一次只测试一个程序
   // std::vector<double> resultOrigin(TEST_NUM);
    //TICK(t_origin);
    //for (size_t i = 0; i < TEST_NUM; i++)
   // {
   //     resultOrigin[i] = GetSubPixelValueOrigin(preCaculatedParameter.data(), width, height, xs[i], ys[i]);
  //  }
  //  TOCK(t_origin);
    return 0;
}

@balleb6545anickk
Copy link
Author

主要是下面这段会很耗时,访存瓶颈会很严重,是不是有办法优化一下
其实这段话就是在求preCaculatedParameter里面5*5的方块的加权和,可能可以优化。

    auto result = _mm256_setzero_pd();
    for (int i = 0; i < 5; i++)
    {
        auto rowResult = _mm256_setzero_pd();
        auto yIndexI = yIndex[i];
#pragma GCC unroll 5
        for (int j = 0; j < 5; j++)
        {
            auto asdfafsd = _mm_add_epi32(yIndexI, xIndex[j]);
            auto factor = _mm256_i32gather_pd(preCaculatedParameter, _mm_add_epi32(yIndexI, xIndex[j]), 8); // 这个是8不是1
            rowResult = _mm256_fmadd_pd(factor, xWeight[j], rowResult);
        }
        result = _mm256_fmadd_pd(rowResult, yWeight[i], result);
    }

@archibate
Copy link
Contributor

可不可以把precalculatedparameter的范围提前扩大,扩大的里面填充上mirror,这样就可以全部用连续访问了。

@balleb6545anickk
Copy link
Author

balleb6545anickk commented Aug 27, 2023

可不可以把precalculatedparameter的范围提前扩大,扩大的里面填充上mirror,这样就可以全部用连续访问了。

可以做mirror padding,甚至实际情况我们可以简化这个问题,就假设xIndex[5]和yIndex[5]是连续的而且不越界,也就是最后是卷积核是5*5的xWeight和yWeight,和precalculatedparameter卷积

archibate added a commit that referenced this issue Aug 31, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants