From c3776bbccc6e0f8b5b4654ccece7016a950973ec Mon Sep 17 00:00:00 2001 From: felixpalmer Date: Fri, 20 Sep 2024 19:24:59 +0200 Subject: [PATCH] chore(shadertools): Port fp64 module to UBO (#2262) --- modules/shadertools/src/index.ts | 4 +- .../modules/math/fp64/fp64-arithmetic-glsl.ts | 174 +++++ .../modules/math/fp64/fp64-functions-glsl.ts | 675 ++++++++++++++++++ .../shadertools/src/modules/math/fp64/fp64.ts | 55 ++ modules/shadertools/test/index.ts | 1 + .../math/fp64-arithmetic-transform.spec.ts | 155 ++++ .../modules/math/fp64-test-utils-transform.ts | 130 ++++ 7 files changed, 1192 insertions(+), 2 deletions(-) create mode 100644 modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts create mode 100644 modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts create mode 100644 modules/shadertools/src/modules/math/fp64/fp64.ts create mode 100644 modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts create mode 100644 modules/shadertools/test/modules/math/fp64-test-utils-transform.ts diff --git a/modules/shadertools/src/index.ts b/modules/shadertools/src/index.ts index 8996b6f626..1e4f395295 100644 --- a/modules/shadertools/src/index.ts +++ b/modules/shadertools/src/index.ts @@ -63,7 +63,7 @@ export {fp64ify, fp64LowPart, fp64ifyMatrix4} from './modules/math/fp64/fp64-uti export {random} from './modules/math/random/random'; export {fp32} from './modules/math/fp32/fp32'; -// export {fp64, fp64arithmetic} from './modules/math/fp64/fp64'; +export {fp64, fp64arithmetic} from './modules/math/fp64/fp64'; // engine shader modules @@ -93,7 +93,7 @@ export {pbrMaterial} from './modules/lighting/pbr-material/pbr-material'; // DEPRECATED - v8 legacy shader modules (non-uniform buffer) // math libraries -export {fp64, fp64arithmetic} from './modules-webgl1/math/fp64/fp64'; +// export {fp64, fp64arithmetic} from './modules-webgl1/math/fp64/fp64'; // projection and lighting export {geometry as geometry1} from './modules-webgl1/geometry/geometry'; diff --git a/modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts b/modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts new file mode 100644 index 0000000000..7ef5e544de --- /dev/null +++ b/modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts @@ -0,0 +1,174 @@ +// luma.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +export const fp64arithmeticShader = /* glsl */ `\ + +uniform fp64arithmeticUniforms { + uniform float ONE; +} fp64; + +/* +About LUMA_FP64_CODE_ELIMINATION_WORKAROUND + +The purpose of this workaround is to prevent shader compilers from +optimizing away necessary arithmetic operations by swapping their sequences +or transform the equation to some 'equivalent' form. + +The method is to multiply an artifical variable, ONE, which will be known to +the compiler to be 1 only at runtime. The whole expression is then represented +as a polynomial with respective to ONE. In the coefficients of all terms, only one a +and one b should appear + +err = (a + b) * ONE^6 - a * ONE^5 - (a + b) * ONE^4 + a * ONE^3 - b - (a + b) * ONE^2 + a * ONE +*/ + +// Divide float number to high and low floats to extend fraction bits +vec2 split(float a) { + const float SPLIT = 4097.0; + float t = a * SPLIT; +#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND) + float a_hi = t * fp64.ONE - (t - a); + float a_lo = a * fp64.ONE - a_hi; +#else + float a_hi = t - (t - a); + float a_lo = a - a_hi; +#endif + return vec2(a_hi, a_lo); +} + +// Divide float number again when high float uses too many fraction bits +vec2 split2(vec2 a) { + vec2 b = split(a.x); + b.y += a.y; + return b; +} + +// Special sum operation when a > b +vec2 quickTwoSum(float a, float b) { +#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND) + float sum = (a + b) * fp64.ONE; + float err = b - (sum - a) * fp64.ONE; +#else + float sum = a + b; + float err = b - (sum - a); +#endif + return vec2(sum, err); +} + +// General sum operation +vec2 twoSum(float a, float b) { + float s = (a + b); +#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND) + float v = (s * fp64.ONE - a) * fp64.ONE; + float err = (a - (s - v) * fp64.ONE) * fp64.ONE * fp64.ONE * fp64.ONE + (b - v); +#else + float v = s - a; + float err = (a - (s - v)) + (b - v); +#endif + return vec2(s, err); +} + +vec2 twoSub(float a, float b) { + float s = (a - b); +#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND) + float v = (s * fp64.ONE - a) * fp64.ONE; + float err = (a - (s - v) * fp64.ONE) * fp64.ONE * fp64.ONE * fp64.ONE - (b + v); +#else + float v = s - a; + float err = (a - (s - v)) - (b + v); +#endif + return vec2(s, err); +} + +vec2 twoSqr(float a) { + float prod = a * a; + vec2 a_fp64 = split(a); +#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND) + float err = ((a_fp64.x * a_fp64.x - prod) * fp64.ONE + 2.0 * a_fp64.x * + a_fp64.y * fp64.ONE * fp64.ONE) + a_fp64.y * a_fp64.y * fp64.ONE * fp64.ONE * fp64.ONE; +#else + float err = ((a_fp64.x * a_fp64.x - prod) + 2.0 * a_fp64.x * a_fp64.y) + a_fp64.y * a_fp64.y; +#endif + return vec2(prod, err); +} + +vec2 twoProd(float a, float b) { + float prod = a * b; + vec2 a_fp64 = split(a); + vec2 b_fp64 = split(b); + float err = ((a_fp64.x * b_fp64.x - prod) + a_fp64.x * b_fp64.y + + a_fp64.y * b_fp64.x) + a_fp64.y * b_fp64.y; + return vec2(prod, err); +} + +vec2 sum_fp64(vec2 a, vec2 b) { + vec2 s, t; + s = twoSum(a.x, b.x); + t = twoSum(a.y, b.y); + s.y += t.x; + s = quickTwoSum(s.x, s.y); + s.y += t.y; + s = quickTwoSum(s.x, s.y); + return s; +} + +vec2 sub_fp64(vec2 a, vec2 b) { + vec2 s, t; + s = twoSub(a.x, b.x); + t = twoSub(a.y, b.y); + s.y += t.x; + s = quickTwoSum(s.x, s.y); + s.y += t.y; + s = quickTwoSum(s.x, s.y); + return s; +} + +vec2 mul_fp64(vec2 a, vec2 b) { + vec2 prod = twoProd(a.x, b.x); + // y component is for the error + prod.y += a.x * b.y; +#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND) + prod = split2(prod); +#endif + prod = quickTwoSum(prod.x, prod.y); + prod.y += a.y * b.x; +#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND) + prod = split2(prod); +#endif + prod = quickTwoSum(prod.x, prod.y); + return prod; +} + +vec2 div_fp64(vec2 a, vec2 b) { + float xn = 1.0 / b.x; +#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND) + vec2 yn = mul_fp64(a, vec2(xn, 0)); +#else + vec2 yn = a * xn; +#endif + float diff = (sub_fp64(a, mul_fp64(b, yn))).x; + vec2 prod = twoProd(xn, diff); + return sum_fp64(yn, prod); +} + +vec2 sqrt_fp64(vec2 a) { + if (a.x == 0.0 && a.y == 0.0) return vec2(0.0, 0.0); + if (a.x < 0.0) return vec2(0.0 / 0.0, 0.0 / 0.0); + + float x = 1.0 / sqrt(a.x); + float yn = a.x * x; +#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND) + vec2 yn_sqr = twoSqr(yn) * fp64.ONE; +#else + vec2 yn_sqr = twoSqr(yn); +#endif + float diff = sub_fp64(a, yn_sqr).x; + vec2 prod = twoProd(x * 0.5, diff); +#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND) + return sum_fp64(split(yn), prod); +#else + return sum_fp64(vec2(yn, 0.0), prod); +#endif +} +`; diff --git a/modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts b/modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts new file mode 100644 index 0000000000..516585f0aa --- /dev/null +++ b/modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts @@ -0,0 +1,675 @@ +// luma.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +export const fp64functionShader = /* glsl */ `\ +const vec2 E_FP64 = vec2(2.7182817459106445e+00, 8.254840366817007e-08); +const vec2 LOG2_FP64 = vec2(0.6931471824645996e+00, -1.9046542121259336e-09); +const vec2 PI_FP64 = vec2(3.1415927410125732, -8.742278012618954e-8); +const vec2 TWO_PI_FP64 = vec2(6.2831854820251465, -1.7484556025237907e-7); +const vec2 PI_2_FP64 = vec2(1.5707963705062866, -4.371139006309477e-8); +const vec2 PI_4_FP64 = vec2(0.7853981852531433, -2.1855695031547384e-8); +const vec2 PI_16_FP64 = vec2(0.19634954631328583, -5.463923757886846e-9); +const vec2 PI_16_2_FP64 = vec2(0.39269909262657166, -1.0927847515773692e-8); +const vec2 PI_16_3_FP64 = vec2(0.5890486240386963, -1.4906100798128818e-9); +const vec2 PI_180_FP64 = vec2(0.01745329238474369, 1.3519960498364902e-10); + +const vec2 SIN_TABLE_0_FP64 = vec2(0.19509032368659973, -1.6704714833615242e-9); +const vec2 SIN_TABLE_1_FP64 = vec2(0.3826834261417389, 6.22335089017767e-9); +const vec2 SIN_TABLE_2_FP64 = vec2(0.5555702447891235, -1.1769521357507529e-8); +const vec2 SIN_TABLE_3_FP64 = vec2(0.7071067690849304, 1.2101617041793133e-8); + +const vec2 COS_TABLE_0_FP64 = vec2(0.9807852506637573, 2.9739473106360492e-8); +const vec2 COS_TABLE_1_FP64 = vec2(0.9238795042037964, 2.8307490351764386e-8); +const vec2 COS_TABLE_2_FP64 = vec2(0.8314695954322815, 1.6870263741530778e-8); +const vec2 COS_TABLE_3_FP64 = vec2(0.7071067690849304, 1.2101617152815436e-8); + +const vec2 INVERSE_FACTORIAL_3_FP64 = vec2(1.666666716337204e-01, -4.967053879312289e-09); // 1/3! +const vec2 INVERSE_FACTORIAL_4_FP64 = vec2(4.16666679084301e-02, -1.2417634698280722e-09); // 1/4! +const vec2 INVERSE_FACTORIAL_5_FP64 = vec2(8.333333767950535e-03, -4.34617203337595e-10); // 1/5! +const vec2 INVERSE_FACTORIAL_6_FP64 = vec2(1.3888889225199819e-03, -3.3631094437103215e-11); // 1/6! +const vec2 INVERSE_FACTORIAL_7_FP64 = vec2(1.9841270113829523e-04, -2.725596874933456e-12); // 1/7! +const vec2 INVERSE_FACTORIAL_8_FP64 = vec2(2.4801587642286904e-05, -3.406996025904184e-13); // 1/8! +const vec2 INVERSE_FACTORIAL_9_FP64 = vec2(2.75573188446287533e-06, 3.7935713937038186e-14); // 1/9! +const vec2 INVERSE_FACTORIAL_10_FP64 = vec2(2.755731998149713e-07, -7.575112367869873e-15); // 1/10! + +float nint(float d) { + if (d == floor(d)) return d; + return floor(d + 0.5); +} + +vec2 nint_fp64(vec2 a) { + float hi = nint(a.x); + float lo; + vec2 tmp; + if (hi == a.x) { + lo = nint(a.y); + tmp = quickTwoSum(hi, lo); + } else { + lo = 0.0; + if (abs(hi - a.x) == 0.5 && a.y < 0.0) { + hi -= 1.0; + } + tmp = vec2(hi, lo); + } + return tmp; +} + +/* k_power controls how much range reduction we would like to have +Range reduction uses the following method: +assume a = k_power * r + m * log(2), k and m being integers. +Set k_power = 4 (we can choose other k to trade accuracy with performance. +we only need to calculate exp(r) and using exp(a) = 2^m * exp(r)^k_power; +*/ + +vec2 exp_fp64(vec2 a) { + // We need to make sure these two numbers match + // as bit-wise shift is not available in GLSL 1.0 + const int k_power = 4; + const float k = 16.0; + + const float inv_k = 1.0 / k; + + if (a.x <= -88.0) return vec2(0.0, 0.0); + if (a.x >= 88.0) return vec2(1.0 / 0.0, 1.0 / 0.0); + if (a.x == 0.0 && a.y == 0.0) return vec2(1.0, 0.0); + if (a.x == 1.0 && a.y == 0.0) return E_FP64; + + float m = floor(a.x / LOG2_FP64.x + 0.5); + vec2 r = sub_fp64(a, mul_fp64(LOG2_FP64, vec2(m, 0.0))) * inv_k; + vec2 s, t, p; + + p = mul_fp64(r, r); + s = sum_fp64(r, p * 0.5); + p = mul_fp64(p, r); + t = mul_fp64(p, INVERSE_FACTORIAL_3_FP64); + + s = sum_fp64(s, t); + p = mul_fp64(p, r); + t = mul_fp64(p, INVERSE_FACTORIAL_4_FP64); + + s = sum_fp64(s, t); + p = mul_fp64(p, r); + t = mul_fp64(p, INVERSE_FACTORIAL_5_FP64); + + // s = sum_fp64(s, t); + // p = mul_fp64(p, r); + // t = mul_fp64(p, INVERSE_FACTORIAL_6_FP64); + + // s = sum_fp64(s, t); + // p = mul_fp64(p, r); + // t = mul_fp64(p, INVERSE_FACTORIAL_7_FP64); + + s = sum_fp64(s, t); + + + // At this point, s = exp(r) - 1; but after following 4 recursions, we will get exp(r) ^ 512 - 1. + for (int i = 0; i < k_power; i++) { + s = sum_fp64(s * 2.0, mul_fp64(s, s)); + } + +#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND) + s = sum_fp64(s, vec2(fp64.ONE, 0.0)); +#else + s = sum_fp64(s, vec2(1.0, 0.0)); +#endif + + return s * pow(2.0, m); +// return r; +} + +vec2 log_fp64(vec2 a) +{ + if (a.x == 1.0 && a.y == 0.0) return vec2(0.0, 0.0); + if (a.x <= 0.0) return vec2(0.0 / 0.0, 0.0 / 0.0); + vec2 x = vec2(log(a.x), 0.0); + vec2 s; +#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND) + s = vec2(fp64.ONE, 0.0); +#else + s = vec2(1.0, 0.0); +#endif + + x = sub_fp64(sum_fp64(x, mul_fp64(a, exp_fp64(-x))), s); + return x; +} + +vec2 sin_taylor_fp64(vec2 a) { + vec2 r, s, t, x; + + if (a.x == 0.0 && a.y == 0.0) { + return vec2(0.0, 0.0); + } + + x = -mul_fp64(a, a); + s = a; + r = a; + + r = mul_fp64(r, x); + t = mul_fp64(r, INVERSE_FACTORIAL_3_FP64); + s = sum_fp64(s, t); + + r = mul_fp64(r, x); + t = mul_fp64(r, INVERSE_FACTORIAL_5_FP64); + s = sum_fp64(s, t); + + /* keep the following commented code in case we need them + for extra accuracy from the Taylor expansion*/ + + // r = mul_fp64(r, x); + // t = mul_fp64(r, INVERSE_FACTORIAL_7_FP64); + // s = sum_fp64(s, t); + + // r = mul_fp64(r, x); + // t = mul_fp64(r, INVERSE_FACTORIAL_9_FP64); + // s = sum_fp64(s, t); + + return s; +} + +vec2 cos_taylor_fp64(vec2 a) { + vec2 r, s, t, x; + + if (a.x == 0.0 && a.y == 0.0) { + return vec2(1.0, 0.0); + } + + x = -mul_fp64(a, a); + r = x; + s = sum_fp64(vec2(1.0, 0.0), r * 0.5); + + r = mul_fp64(r, x); + t = mul_fp64(r, INVERSE_FACTORIAL_4_FP64); + s = sum_fp64(s, t); + + r = mul_fp64(r, x); + t = mul_fp64(r, INVERSE_FACTORIAL_6_FP64); + s = sum_fp64(s, t); + + /* keep the following commented code in case we need them + for extra accuracy from the Taylor expansion*/ + + // r = mul_fp64(r, x); + // t = mul_fp64(r, INVERSE_FACTORIAL_8_FP64); + // s = sum_fp64(s, t); + + // r = mul_fp64(r, x); + // t = mul_fp64(r, INVERSE_FACTORIAL_10_FP64); + // s = sum_fp64(s, t); + + return s; +} + +void sincos_taylor_fp64(vec2 a, out vec2 sin_t, out vec2 cos_t) { + if (a.x == 0.0 && a.y == 0.0) { + sin_t = vec2(0.0, 0.0); + cos_t = vec2(1.0, 0.0); + } + + sin_t = sin_taylor_fp64(a); + cos_t = sqrt_fp64(sub_fp64(vec2(1.0, 0.0), mul_fp64(sin_t, sin_t))); +} + +vec2 sin_fp64(vec2 a) { + if (a.x == 0.0 && a.y == 0.0) { + return vec2(0.0, 0.0); + } + + // 2pi range reduction + vec2 z = nint_fp64(div_fp64(a, TWO_PI_FP64)); + vec2 r = sub_fp64(a, mul_fp64(TWO_PI_FP64, z)); + + vec2 t; + float q = floor(r.x / PI_2_FP64.x + 0.5); + int j = int(q); + + if (j < -2 || j > 2) { + return vec2(0.0 / 0.0, 0.0 / 0.0); + } + + t = sub_fp64(r, mul_fp64(PI_2_FP64, vec2(q, 0.0))); + + q = floor(t.x / PI_16_FP64.x + 0.5); + int k = int(q); + + if (k == 0) { + if (j == 0) { + return sin_taylor_fp64(t); + } else if (j == 1) { + return cos_taylor_fp64(t); + } else if (j == -1) { + return -cos_taylor_fp64(t); + } else { + return -sin_taylor_fp64(t); + } + } + + int abs_k = int(abs(float(k))); + + if (abs_k > 4) { + return vec2(0.0 / 0.0, 0.0 / 0.0); + } else { + t = sub_fp64(t, mul_fp64(PI_16_FP64, vec2(q, 0.0))); + } + + vec2 u = vec2(0.0, 0.0); + vec2 v = vec2(0.0, 0.0); + +#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND) + if (abs(float(abs_k) - 1.0) < 0.5) { + u = COS_TABLE_0_FP64; + v = SIN_TABLE_0_FP64; + } else if (abs(float(abs_k) - 2.0) < 0.5) { + u = COS_TABLE_1_FP64; + v = SIN_TABLE_1_FP64; + } else if (abs(float(abs_k) - 3.0) < 0.5) { + u = COS_TABLE_2_FP64; + v = SIN_TABLE_2_FP64; + } else if (abs(float(abs_k) - 4.0) < 0.5) { + u = COS_TABLE_3_FP64; + v = SIN_TABLE_3_FP64; + } +#else + if (abs_k == 1) { + u = COS_TABLE_0_FP64; + v = SIN_TABLE_0_FP64; + } else if (abs_k == 2) { + u = COS_TABLE_1_FP64; + v = SIN_TABLE_1_FP64; + } else if (abs_k == 3) { + u = COS_TABLE_2_FP64; + v = SIN_TABLE_2_FP64; + } else if (abs_k == 4) { + u = COS_TABLE_3_FP64; + v = SIN_TABLE_3_FP64; + } +#endif + + vec2 sin_t, cos_t; + sincos_taylor_fp64(t, sin_t, cos_t); + + + + vec2 result = vec2(0.0, 0.0); + if (j == 0) { + if (k > 0) { + result = sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + } else { + result = sub_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + } + } else if (j == 1) { + if (k > 0) { + result = sub_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t)); + } else { + result = sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t)); + } + } else if (j == -1) { + if (k > 0) { + result = sub_fp64(mul_fp64(v, sin_t), mul_fp64(u, cos_t)); + } else { + result = -sum_fp64(mul_fp64(v, sin_t), mul_fp64(u, cos_t)); + } + } else { + if (k > 0) { + result = -sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + } else { + result = sub_fp64(mul_fp64(v, cos_t), mul_fp64(u, sin_t)); + } + } + + return result; +} + +vec2 cos_fp64(vec2 a) { + if (a.x == 0.0 && a.y == 0.0) { + return vec2(1.0, 0.0); + } + + // 2pi range reduction + vec2 z = nint_fp64(div_fp64(a, TWO_PI_FP64)); + vec2 r = sub_fp64(a, mul_fp64(TWO_PI_FP64, z)); + + vec2 t; + float q = floor(r.x / PI_2_FP64.x + 0.5); + int j = int(q); + + if (j < -2 || j > 2) { + return vec2(0.0 / 0.0, 0.0 / 0.0); + } + + t = sub_fp64(r, mul_fp64(PI_2_FP64, vec2(q, 0.0))); + + q = floor(t.x / PI_16_FP64.x + 0.5); + int k = int(q); + + if (k == 0) { + if (j == 0) { + return cos_taylor_fp64(t); + } else if (j == 1) { + return -sin_taylor_fp64(t); + } else if (j == -1) { + return sin_taylor_fp64(t); + } else { + return -cos_taylor_fp64(t); + } + } + + int abs_k = int(abs(float(k))); + + if (abs_k > 4) { + return vec2(0.0 / 0.0, 0.0 / 0.0); + } else { + t = sub_fp64(t, mul_fp64(PI_16_FP64, vec2(q, 0.0))); + } + + vec2 u = vec2(0.0, 0.0); + vec2 v = vec2(0.0, 0.0); + +#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND) + if (abs(float(abs_k) - 1.0) < 0.5) { + u = COS_TABLE_0_FP64; + v = SIN_TABLE_0_FP64; + } else if (abs(float(abs_k) - 2.0) < 0.5) { + u = COS_TABLE_1_FP64; + v = SIN_TABLE_1_FP64; + } else if (abs(float(abs_k) - 3.0) < 0.5) { + u = COS_TABLE_2_FP64; + v = SIN_TABLE_2_FP64; + } else if (abs(float(abs_k) - 4.0) < 0.5) { + u = COS_TABLE_3_FP64; + v = SIN_TABLE_3_FP64; + } +#else + if (abs_k == 1) { + u = COS_TABLE_0_FP64; + v = SIN_TABLE_0_FP64; + } else if (abs_k == 2) { + u = COS_TABLE_1_FP64; + v = SIN_TABLE_1_FP64; + } else if (abs_k == 3) { + u = COS_TABLE_2_FP64; + v = SIN_TABLE_2_FP64; + } else if (abs_k == 4) { + u = COS_TABLE_3_FP64; + v = SIN_TABLE_3_FP64; + } +#endif + + vec2 sin_t, cos_t; + sincos_taylor_fp64(t, sin_t, cos_t); + + vec2 result = vec2(0.0, 0.0); + if (j == 0) { + if (k > 0) { + result = sub_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t)); + } else { + result = sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t)); + } + } else if (j == 1) { + if (k > 0) { + result = -sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + } else { + result = sub_fp64(mul_fp64(v, cos_t), mul_fp64(u, sin_t)); + } + } else if (j == -1) { + if (k > 0) { + result = sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + } else { + result = sub_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + } + } else { + if (k > 0) { + result = sub_fp64(mul_fp64(v, sin_t), mul_fp64(u, cos_t)); + } else { + result = -sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t)); + } + } + + return result; +} + +vec2 tan_fp64(vec2 a) { + vec2 sin_a; + vec2 cos_a; + + if (a.x == 0.0 && a.y == 0.0) { + return vec2(0.0, 0.0); + } + + // 2pi range reduction + vec2 z = nint_fp64(div_fp64(a, TWO_PI_FP64)); + vec2 r = sub_fp64(a, mul_fp64(TWO_PI_FP64, z)); + + vec2 t; + float q = floor(r.x / PI_2_FP64.x + 0.5); + int j = int(q); + + + if (j < -2 || j > 2) { + return vec2(0.0 / 0.0, 0.0 / 0.0); + } + + t = sub_fp64(r, mul_fp64(PI_2_FP64, vec2(q, 0.0))); + + q = floor(t.x / PI_16_FP64.x + 0.5); + int k = int(q); + int abs_k = int(abs(float(k))); + + // We just can't get PI/16 * 3.0 very accurately. + // so let's just store it + if (abs_k > 4) { + return vec2(0.0 / 0.0, 0.0 / 0.0); + } else { + t = sub_fp64(t, mul_fp64(PI_16_FP64, vec2(q, 0.0))); + } + + + vec2 u = vec2(0.0, 0.0); + vec2 v = vec2(0.0, 0.0); + + vec2 sin_t, cos_t; + vec2 s, c; + sincos_taylor_fp64(t, sin_t, cos_t); + + if (k == 0) { + s = sin_t; + c = cos_t; + } else { +#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND) + if (abs(float(abs_k) - 1.0) < 0.5) { + u = COS_TABLE_0_FP64; + v = SIN_TABLE_0_FP64; + } else if (abs(float(abs_k) - 2.0) < 0.5) { + u = COS_TABLE_1_FP64; + v = SIN_TABLE_1_FP64; + } else if (abs(float(abs_k) - 3.0) < 0.5) { + u = COS_TABLE_2_FP64; + v = SIN_TABLE_2_FP64; + } else if (abs(float(abs_k) - 4.0) < 0.5) { + u = COS_TABLE_3_FP64; + v = SIN_TABLE_3_FP64; + } +#else + if (abs_k == 1) { + u = COS_TABLE_0_FP64; + v = SIN_TABLE_0_FP64; + } else if (abs_k == 2) { + u = COS_TABLE_1_FP64; + v = SIN_TABLE_1_FP64; + } else if (abs_k == 3) { + u = COS_TABLE_2_FP64; + v = SIN_TABLE_2_FP64; + } else if (abs_k == 4) { + u = COS_TABLE_3_FP64; + v = SIN_TABLE_3_FP64; + } +#endif + if (k > 0) { + s = sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + c = sub_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t)); + } else { + s = sub_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t)); + c = sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t)); + } + } + + if (j == 0) { + sin_a = s; + cos_a = c; + } else if (j == 1) { + sin_a = c; + cos_a = -s; + } else if (j == -1) { + sin_a = -c; + cos_a = s; + } else { + sin_a = -s; + cos_a = -c; + } + return div_fp64(sin_a, cos_a); +} + +vec2 radians_fp64(vec2 degree) { + return mul_fp64(degree, PI_180_FP64); +} + +vec2 mix_fp64(vec2 a, vec2 b, float x) { + vec2 range = sub_fp64(b, a); + return sum_fp64(a, mul_fp64(range, vec2(x, 0.0))); +} + +// Vector functions +// vec2 functions +void vec2_sum_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) { + out_val[0] = sum_fp64(a[0], b[0]); + out_val[1] = sum_fp64(a[1], b[1]); +} + +void vec2_sub_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) { + out_val[0] = sub_fp64(a[0], b[0]); + out_val[1] = sub_fp64(a[1], b[1]); +} + +void vec2_mul_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) { + out_val[0] = mul_fp64(a[0], b[0]); + out_val[1] = mul_fp64(a[1], b[1]); +} + +void vec2_div_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) { + out_val[0] = div_fp64(a[0], b[0]); + out_val[1] = div_fp64(a[1], b[1]); +} + +void vec2_mix_fp64(vec2 x[2], vec2 y[2], float a, out vec2 out_val[2]) { + vec2 range[2]; + vec2_sub_fp64(y, x, range); + vec2 portion[2]; + portion[0] = range[0] * a; + portion[1] = range[1] * a; + vec2_sum_fp64(x, portion, out_val); +} + +vec2 vec2_length_fp64(vec2 x[2]) { + return sqrt_fp64(sum_fp64(mul_fp64(x[0], x[0]), mul_fp64(x[1], x[1]))); +} + +void vec2_normalize_fp64(vec2 x[2], out vec2 out_val[2]) { + vec2 length = vec2_length_fp64(x); + vec2 length_vec2[2]; + length_vec2[0] = length; + length_vec2[1] = length; + + vec2_div_fp64(x, length_vec2, out_val); +} + +vec2 vec2_distance_fp64(vec2 x[2], vec2 y[2]) { + vec2 diff[2]; + vec2_sub_fp64(x, y, diff); + return vec2_length_fp64(diff); +} + +vec2 vec2_dot_fp64(vec2 a[2], vec2 b[2]) { + vec2 v[2]; + + v[0] = mul_fp64(a[0], b[0]); + v[1] = mul_fp64(a[1], b[1]); + + return sum_fp64(v[0], v[1]); +} + +// vec3 functions +void vec3_sub_fp64(vec2 a[3], vec2 b[3], out vec2 out_val[3]) { + for (int i = 0; i < 3; i++) { + out_val[i] = sum_fp64(a[i], b[i]); + } +} + +void vec3_sum_fp64(vec2 a[3], vec2 b[3], out vec2 out_val[3]) { + for (int i = 0; i < 3; i++) { + out_val[i] = sum_fp64(a[i], b[i]); + } +} + +vec2 vec3_length_fp64(vec2 x[3]) { + return sqrt_fp64(sum_fp64(sum_fp64(mul_fp64(x[0], x[0]), mul_fp64(x[1], x[1])), + mul_fp64(x[2], x[2]))); +} + +vec2 vec3_distance_fp64(vec2 x[3], vec2 y[3]) { + vec2 diff[3]; + vec3_sub_fp64(x, y, diff); + return vec3_length_fp64(diff); +} + +// vec4 functions +void vec4_fp64(vec4 a, out vec2 out_val[4]) { + out_val[0].x = a[0]; + out_val[0].y = 0.0; + + out_val[1].x = a[1]; + out_val[1].y = 0.0; + + out_val[2].x = a[2]; + out_val[2].y = 0.0; + + out_val[3].x = a[3]; + out_val[3].y = 0.0; +} + +void vec4_scalar_mul_fp64(vec2 a[4], vec2 b, out vec2 out_val[4]) { + out_val[0] = mul_fp64(a[0], b); + out_val[1] = mul_fp64(a[1], b); + out_val[2] = mul_fp64(a[2], b); + out_val[3] = mul_fp64(a[3], b); +} + +void vec4_sum_fp64(vec2 a[4], vec2 b[4], out vec2 out_val[4]) { + for (int i = 0; i < 4; i++) { + out_val[i] = sum_fp64(a[i], b[i]); + } +} + +void vec4_dot_fp64(vec2 a[4], vec2 b[4], out vec2 out_val) { + vec2 v[4]; + + v[0] = mul_fp64(a[0], b[0]); + v[1] = mul_fp64(a[1], b[1]); + v[2] = mul_fp64(a[2], b[2]); + v[3] = mul_fp64(a[3], b[3]); + + out_val = sum_fp64(sum_fp64(v[0], v[1]), sum_fp64(v[2], v[3])); +} + +void mat4_vec4_mul_fp64(vec2 b[16], vec2 a[4], out vec2 out_val[4]) { + vec2 tmp[4]; + + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + tmp[j] = b[j + i * 4]; + } + vec4_dot_fp64(a, tmp, out_val[i]); + } +} +`; diff --git a/modules/shadertools/src/modules/math/fp64/fp64.ts b/modules/shadertools/src/modules/math/fp64/fp64.ts new file mode 100644 index 0000000000..43ceefa903 --- /dev/null +++ b/modules/shadertools/src/modules/math/fp64/fp64.ts @@ -0,0 +1,55 @@ +// luma.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import {ShaderModule} from '../../../lib/shader-module/shader-module'; + +import {fp64ify, fp64LowPart, fp64ifyMatrix4} from '../../../modules/math/fp64/fp64-utils'; +import {fp64arithmeticShader} from './fp64-arithmetic-glsl'; +import {fp64functionShader} from './fp64-functions-glsl'; + +type FP64Props = {}; +type FP64Uniforms = {ONE: number}; +type FP64Bindings = {}; + +type FP64Utilities = { + fp64ify: typeof fp64ify; + fp64LowPart: typeof fp64LowPart; + fp64ifyMatrix4: typeof fp64ifyMatrix4; +}; + +const defaultUniforms: FP64Uniforms = { + // Used in LUMA_FP64_CODE_ELIMINATION_WORKAROUND + ONE: 1.0 +}; + +/** + * 64bit arithmetic: add, sub, mul, div (small subset of fp64 module) + */ +export const fp64arithmetic: ShaderModule & FP64Utilities = { + name: 'fp64arithmetic', + vs: fp64arithmeticShader, + defaultUniforms, + uniformTypes: {ONE: 'f32'}, + + // Additional Functions + fp64ify, + fp64LowPart, + fp64ifyMatrix4 +}; + +/** + * Full 64 bit math library + */ +export const fp64: ShaderModule<{}> & FP64Utilities = { + name: 'fp64', + vs: fp64functionShader, + dependencies: [fp64arithmetic], + + // Additional Functions + fp64ify, + fp64LowPart, + fp64ifyMatrix4 +}; + +export {fp64ify, fp64LowPart, fp64ifyMatrix4}; diff --git a/modules/shadertools/test/index.ts b/modules/shadertools/test/index.ts index ff16c33703..fc1ddbcda0 100644 --- a/modules/shadertools/test/index.ts +++ b/modules/shadertools/test/index.ts @@ -31,6 +31,7 @@ import './lib/shader-assembler.spec'; // Data utilities import './modules/math/fp16-utils.spec'; +import './modules/math/fp64-arithmetic-transform.spec'; import './modules/math/fp64-utils.spec'; // General modules tests diff --git a/modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts b/modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts new file mode 100644 index 0000000000..530ce1892d --- /dev/null +++ b/modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts @@ -0,0 +1,155 @@ +// luma.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +import test from 'tape-promise/tape'; +import {getWebGLTestDevice} from '@luma.gl/test-utils'; +import {runTests} from './fp64-test-utils-transform'; + +// Failing test cases are ignored based on gpu and glslFunc, using ignoreFor field +// ignoreFor: [{gpu: ['glslFunc-1', 'glslFunc-2']}] => ignores for `'glslFunc-1' and 'glslFunc-2` when running on `gpu` +// Many of these tests fail on Apple GPUs with very large margins, see https://github.com/visgl/luma.gl/issues/1764. +const commonTestCases = [ + {a: 2, b: 2}, + {a: 0.1, b: 0.1, ignoreFor: {apple: ['sum_fp64', 'mul_fp64', 'div_fp64']}}, + {a: 3.0e-19, b: 3.3e13, ignoreFor: {apple: ['sum_fp64', 'sub_fp64']}}, + {a: 9.9e-40, b: 1.7e3, ignoreFor: {}}, + {a: 1.5e-36, b: 1.7e-16, ignoreFor: {}}, + {a: 9.4e-26, b: 51, ignoreFor: {}}, + {a: 6.7e-20, b: 0.93, ignoreFor: {apple: ['sum_fp64', 'sub_fp64']}}, + + // mul_fp64: Large numbers once multipled, can't be represented by 32 bit precision and Math.fround() returns NAN + // sqrt_fp64: Fail on INTEL with margin 3.906051071870294e-12 + { + a: 2.4e3, + b: 5.9e31, + ignoreFor: {all: ['mul_fp64'], intel: ['sqrt_fp64'], apple: ['sum_fp64', 'sub_fp64']} + }, + + // div_fp64 fails on INTEL with margin 1.7318642528355118e-12 + // sqrt_fp64 fails on INTEL with margin 1.5518878351528786e-12 + { + a: 1.4e9, + b: 6.3e5, + ignoreFor: {intel: ['div_fp64', 'sqrt_fp64'], apple: ['mul_fp64', 'div_fp64']} + }, + + // div fails on INTEL with margin 1.7886288892678105e-14 + // sqrt fails on INTEL with margin 2.5362810256331708e-12 + {a: 3.0e9, b: 4.3e-23, ignoreFor: {intel: ['div_fp64', 'sqrt_fp64'], apple: ['div_fp64']}}, + + // div fail on INTEL with margin 1.137354350370519e-12 + {a: 1.7e-19, b: 2.7e-27, ignoreFor: {intel: ['div_fp64'], apple: ['div_fp64']}}, + + // div_fp64 fails on INTEL with margin 2.7291999999999997e-12 + // sqrt_fp64 fails on INTEL with margin 3.501857471494295e-12 + {a: 0.3, b: 3.2e-16, ignoreFor: {intel: ['div_fp64', 'sqrt_fp64'], apple: ['div_fp64']}}, + + // mul_fp64 : fails since result can't be represented by 32 bit floats + // div_fp64 : fails on INTEL with margin 1.9999999999999994e-15 + // sqrt_fp64 : fails on INTEL with margin 1.832115697751484e-12 + { + a: 4.1e30, + b: 8.2e15, + ignoreFor: {all: ['mul_fp64'], intel: ['div_fp64', 'sqrt_fp64'], apple: ['div_fp64']} + }, + + // Fails on INTEL, margin 3.752606081210107e-12 + { + a: 6.2e3, + b: 6.3e10, + ignoreFor: {intel: ['sqrt_fp64'], apple: ['sum_fp64', 'mul_fp64', 'sub_fp64']} + }, + // Fails on INTEL, margin 3.872578286363912e-13 + {a: 2.5e2, b: 5.1e-21, ignoreFor: {intel: ['sqrt_fp64'], apple: ['div_fp64']}}, + // Fails on INTEL, margin 1.5332142001740705e-12 + {a: 96, b: 1.7e4, ignoreFor: {intel: ['sqrt_fp64'], apple: ['div_fp64']}}, + // // Fail on INTEL, margin 1.593162047558726e-12 + { + a: 0.27, + b: 2.3e16, + ignoreFor: {intel: ['sqrt_fp64'], apple: ['sum_fp64', 'mul_fp64', 'sub_fp64']} + }, + // Fails on INTEL, margin 1.014956357028767e-12 + {a: 18, b: 9.1e-9, ignoreFor: {intel: ['sqrt_fp64'], apple: ['div_fp64']}} +]; + +// Filter all tests cases based on current gpu and glsFunc +function getTestCasesFor(device, glslFunc) { + const debugInfo = device.info; + const testCases = commonTestCases.filter(testCase => { + if (testCase.ignoreFor) { + for (const gpu in testCase.ignoreFor) { + if ( + (gpu === 'all' || debugInfo.vendor.toLowerCase().indexOf(gpu) >= 0) && + testCase.ignoreFor[gpu].includes(glslFunc) + ) { + return false; + } + } + } + return true; + }); + return testCases; +} + +test('fp64#sum_fp64', async t => { + console.log('start'); + const webglDevice = await getWebGLTestDevice(); + console.log('device'); + + const glslFunc = 'sum_fp64'; + const testCases = getTestCasesFor(webglDevice, glslFunc); + await runTests(webglDevice, {glslFunc, binary: true, op: (a, b) => a + b, testCases, t}); + t.end(); +}); + +test('fp64#sub_fp64', async t => { + const webglDevice = await getWebGLTestDevice(); + + const glslFunc = 'sub_fp64'; + const testCases = getTestCasesFor(webglDevice, glslFunc); + await runTests(webglDevice, {glslFunc, binary: true, op: (a, b) => a - b, testCases, t}); + t.end(); +}); + +test('fp64#mul_fp64', async t => { + const webglDevice = await getWebGLTestDevice(); + + const glslFunc = 'mul_fp64'; + const testCases = getTestCasesFor(webglDevice, glslFunc); + await runTests(webglDevice, { + glslFunc, + binary: true, + op: (a, b) => a * b, + limit: 128, + testCases, + t + }); + t.end(); +}); + +test('fp64#div_fp64', async t => { + const webglDevice = await getWebGLTestDevice(); + + const glslFunc = 'div_fp64'; + const testCases = getTestCasesFor(webglDevice, glslFunc); + await runTests(webglDevice, { + glslFunc, + binary: true, + op: (a, b) => a / b, + limit: 128, + testCases, + t + }); + t.end(); +}); + +test('fp64#sqrt_fp64', async t => { + const webglDevice = await getWebGLTestDevice(); + + const glslFunc = 'sqrt_fp64'; + const testCases = getTestCasesFor(webglDevice, glslFunc); + await runTests(webglDevice, {glslFunc, op: a => Math.sqrt(a), limit: 128, testCases, t}); + t.end(); +}); diff --git a/modules/shadertools/test/modules/math/fp64-test-utils-transform.ts b/modules/shadertools/test/modules/math/fp64-test-utils-transform.ts new file mode 100644 index 0000000000..5f778cf2bd --- /dev/null +++ b/modules/shadertools/test/modules/math/fp64-test-utils-transform.ts @@ -0,0 +1,130 @@ +// luma.gl +// SPDX-License-Identifier: MIT +// Copyright (c) vis.gl contributors + +// Special utility functions for df64 tests + +/* eslint-disable camelcase, prefer-template, max-len */ + +import {Device} from '@luma.gl/core'; +import {BufferTransform} from '@luma.gl/engine'; +import {fp64} from '@luma.gl/shadertools'; +import {equals, config} from '@math.gl/core'; +const {fp64ify} = fp64; + +// Use 'invariant' specifier to work around some issues on Apple GPUs. The +// specifier may or may not have an effect, depending on the browser and the +// ANGLE backend, but it's an improvement when it's supported. +// See: https://github.com/visgl/luma.gl/issues/1764 + +function getBinaryShader(operation: string): string { + const shader = `\ +#version 300 es +in vec2 a; +in vec2 b; +invariant out vec2 result; +void main(void) { + result = ${operation}(a, b); +} +`; + return shader; +} + +function getUnaryShader(operation: string): string { + return `\ +#version 300 es +in vec2 a; +in vec2 b; +invariant out vec2 result; +void main(void) { + result = ${operation}(a); +} +`; +} + +config.EPSILON = 1e-11; + +function setupFloatData({limit, op, testCases}) { + const count = testCases.length; + const a_fp64 = new Float32Array(count * 2); + const b_fp64 = new Float32Array(count * 2); + const expected_fp64 = new Float32Array(count * 2); + const a = new Array(count); + const b = new Array(count); + const expected = new Array(count); + for (let idx = 0; idx < count; idx++) { + const index = idx * 2; + a[idx] = testCases[idx].a; + b[idx] = testCases[idx].b; + expected[idx] = op(a[idx], b[idx]); + + fp64ify(a[idx], a_fp64, index); + fp64ify(b[idx], b_fp64, index); + fp64ify(expected[idx], expected_fp64, index); + } + return {a, b, expected, a_fp64, b_fp64, expected_fp64}; +} + +function setupFloatTest(device: Device, {glslFunc, binary = false, limit = 256, op, testCases}) { + const {a, b, expected, a_fp64, b_fp64, expected_fp64} = setupFloatData({limit, op, testCases}); + const vs = binary ? getBinaryShader(glslFunc) : getUnaryShader(glslFunc); + const bufferA = device.createBuffer({data: a_fp64}); + const bufferB = device.createBuffer({data: b_fp64}); + const bufferResult = device.createBuffer({byteLength: a_fp64.byteLength}); + const transform = new BufferTransform(device, { + vs, + modules: [fp64], + attributes: {a: bufferA, b: bufferB}, + bufferLayout: [ + {name: 'a', format: 'float32x2'}, + {name: 'b', format: 'float32x2'} + ], + feedbackBuffers: {result: bufferResult}, + outputs: ['result'], + vertexCount: testCases.length + }); + return {a, b, expected, a_fp64, b_fp64, expected_fp64, transform}; +} + +export async function runTests( + device: Device, + {glslFunc, binary = false, op, limit = 256, testCases, t} +) { + if (!BufferTransform.isSupported(device)) { + t.comment('Transform not supported, skipping tests'); + t.end(); + return; + } + + const {a, b, a_fp64, b_fp64, expected_fp64, transform} = setupFloatTest(device, { + glslFunc, + binary, + op, + limit, + testCases + }); + + transform.run(); + + const {buffer, byteOffset, byteLength} = await transform.readAsync('result'); + const gpuResult = new Float32Array( + buffer, + byteOffset, + byteLength / Float32Array.BYTES_PER_ELEMENT + ); + for (let idx = 0; idx < testCases.length; idx++) { + const reference64 = expected_fp64[2 * idx] + expected_fp64[2 * idx + 1]; + const result64 = gpuResult[2 * idx] + gpuResult[2 * idx + 1]; + + const args = binary + ? `(${a[idx].toPrecision(2)}, ${b[idx].toPrecision(2)})` + : `(${a[idx].toPrecision(2)})`; + const message = `${glslFunc}${args} error within tolerance`; + const isEqual = equals(reference64, result64); + t.ok(isEqual, message); + if (!isEqual) { + t.comment(` (tested ${a_fp64.toString()}, ${b_fp64.toString()})`); + t.comment(` (reference ${reference64}, result64 ${result64})`); + } + } +}