From c3776bbccc6e0f8b5b4654ccece7016a950973ec Mon Sep 17 00:00:00 2001
From: felixpalmer <felixpalmer@gmail.com>
Date: Fri, 20 Sep 2024 19:24:59 +0200
Subject: [PATCH] chore(shadertools): Port fp64 module to UBO (#2262)

---
 modules/shadertools/src/index.ts              |   4 +-
 .../modules/math/fp64/fp64-arithmetic-glsl.ts | 174 +++++
 .../modules/math/fp64/fp64-functions-glsl.ts  | 675 ++++++++++++++++++
 .../shadertools/src/modules/math/fp64/fp64.ts |  55 ++
 modules/shadertools/test/index.ts             |   1 +
 .../math/fp64-arithmetic-transform.spec.ts    | 155 ++++
 .../modules/math/fp64-test-utils-transform.ts | 130 ++++
 7 files changed, 1192 insertions(+), 2 deletions(-)
 create mode 100644 modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts
 create mode 100644 modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts
 create mode 100644 modules/shadertools/src/modules/math/fp64/fp64.ts
 create mode 100644 modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts
 create mode 100644 modules/shadertools/test/modules/math/fp64-test-utils-transform.ts

diff --git a/modules/shadertools/src/index.ts b/modules/shadertools/src/index.ts
index 8996b6f626..1e4f395295 100644
--- a/modules/shadertools/src/index.ts
+++ b/modules/shadertools/src/index.ts
@@ -63,7 +63,7 @@ export {fp64ify, fp64LowPart, fp64ifyMatrix4} from './modules/math/fp64/fp64-uti
 export {random} from './modules/math/random/random';
 
 export {fp32} from './modules/math/fp32/fp32';
-// export {fp64, fp64arithmetic} from './modules/math/fp64/fp64';
+export {fp64, fp64arithmetic} from './modules/math/fp64/fp64';
 
 // engine shader modules
 
@@ -93,7 +93,7 @@ export {pbrMaterial} from './modules/lighting/pbr-material/pbr-material';
 // DEPRECATED - v8 legacy shader modules (non-uniform buffer)
 
 // math libraries
-export {fp64, fp64arithmetic} from './modules-webgl1/math/fp64/fp64';
+// export {fp64, fp64arithmetic} from './modules-webgl1/math/fp64/fp64';
 
 // projection and lighting
 export {geometry as geometry1} from './modules-webgl1/geometry/geometry';
diff --git a/modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts b/modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts
new file mode 100644
index 0000000000..7ef5e544de
--- /dev/null
+++ b/modules/shadertools/src/modules/math/fp64/fp64-arithmetic-glsl.ts
@@ -0,0 +1,174 @@
+// luma.gl
+// SPDX-License-Identifier: MIT
+// Copyright (c) vis.gl contributors
+
+export const fp64arithmeticShader = /* glsl */ `\
+
+uniform fp64arithmeticUniforms {
+  uniform float ONE;
+} fp64;
+
+/*
+About LUMA_FP64_CODE_ELIMINATION_WORKAROUND
+
+The purpose of this workaround is to prevent shader compilers from
+optimizing away necessary arithmetic operations by swapping their sequences
+or transform the equation to some 'equivalent' form.
+
+The method is to multiply an artifical variable, ONE, which will be known to
+the compiler to be 1 only at runtime. The whole expression is then represented
+as a polynomial with respective to ONE. In the coefficients of all terms, only one a
+and one b should appear
+
+err = (a + b) * ONE^6 - a * ONE^5 - (a + b) * ONE^4 + a * ONE^3 - b - (a + b) * ONE^2 + a * ONE
+*/
+
+// Divide float number to high and low floats to extend fraction bits
+vec2 split(float a) {
+  const float SPLIT = 4097.0;
+  float t = a * SPLIT;
+#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND)
+  float a_hi = t * fp64.ONE - (t - a);
+  float a_lo = a * fp64.ONE - a_hi;
+#else
+  float a_hi = t - (t - a);
+  float a_lo = a - a_hi;
+#endif
+  return vec2(a_hi, a_lo);
+}
+
+// Divide float number again when high float uses too many fraction bits
+vec2 split2(vec2 a) {
+  vec2 b = split(a.x);
+  b.y += a.y;
+  return b;
+}
+
+// Special sum operation when a > b
+vec2 quickTwoSum(float a, float b) {
+#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND)
+  float sum = (a + b) * fp64.ONE;
+  float err = b - (sum - a) * fp64.ONE;
+#else
+  float sum = a + b;
+  float err = b - (sum - a);
+#endif
+  return vec2(sum, err);
+}
+
+// General sum operation
+vec2 twoSum(float a, float b) {
+  float s = (a + b);
+#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND)
+  float v = (s * fp64.ONE - a) * fp64.ONE;
+  float err = (a - (s - v) * fp64.ONE) * fp64.ONE * fp64.ONE * fp64.ONE + (b - v);
+#else
+  float v = s - a;
+  float err = (a - (s - v)) + (b - v);
+#endif
+  return vec2(s, err);
+}
+
+vec2 twoSub(float a, float b) {
+  float s = (a - b);
+#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND)
+  float v = (s * fp64.ONE - a) * fp64.ONE;
+  float err = (a - (s - v) * fp64.ONE) * fp64.ONE * fp64.ONE * fp64.ONE - (b + v);
+#else
+  float v = s - a;
+  float err = (a - (s - v)) - (b + v);
+#endif
+  return vec2(s, err);
+}
+
+vec2 twoSqr(float a) {
+  float prod = a * a;
+  vec2 a_fp64 = split(a);
+#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND)
+  float err = ((a_fp64.x * a_fp64.x - prod) * fp64.ONE + 2.0 * a_fp64.x *
+    a_fp64.y * fp64.ONE * fp64.ONE) + a_fp64.y * a_fp64.y * fp64.ONE * fp64.ONE * fp64.ONE;
+#else
+  float err = ((a_fp64.x * a_fp64.x - prod) + 2.0 * a_fp64.x * a_fp64.y) + a_fp64.y * a_fp64.y;
+#endif
+  return vec2(prod, err);
+}
+
+vec2 twoProd(float a, float b) {
+  float prod = a * b;
+  vec2 a_fp64 = split(a);
+  vec2 b_fp64 = split(b);
+  float err = ((a_fp64.x * b_fp64.x - prod) + a_fp64.x * b_fp64.y +
+    a_fp64.y * b_fp64.x) + a_fp64.y * b_fp64.y;
+  return vec2(prod, err);
+}
+
+vec2 sum_fp64(vec2 a, vec2 b) {
+  vec2 s, t;
+  s = twoSum(a.x, b.x);
+  t = twoSum(a.y, b.y);
+  s.y += t.x;
+  s = quickTwoSum(s.x, s.y);
+  s.y += t.y;
+  s = quickTwoSum(s.x, s.y);
+  return s;
+}
+
+vec2 sub_fp64(vec2 a, vec2 b) {
+  vec2 s, t;
+  s = twoSub(a.x, b.x);
+  t = twoSub(a.y, b.y);
+  s.y += t.x;
+  s = quickTwoSum(s.x, s.y);
+  s.y += t.y;
+  s = quickTwoSum(s.x, s.y);
+  return s;
+}
+
+vec2 mul_fp64(vec2 a, vec2 b) {
+  vec2 prod = twoProd(a.x, b.x);
+  // y component is for the error
+  prod.y += a.x * b.y;
+#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND)
+  prod = split2(prod);
+#endif
+  prod = quickTwoSum(prod.x, prod.y);
+  prod.y += a.y * b.x;
+#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND)
+  prod = split2(prod);
+#endif
+  prod = quickTwoSum(prod.x, prod.y);
+  return prod;
+}
+
+vec2 div_fp64(vec2 a, vec2 b) {
+  float xn = 1.0 / b.x;
+#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND)
+  vec2 yn = mul_fp64(a, vec2(xn, 0));
+#else
+  vec2 yn = a * xn;
+#endif
+  float diff = (sub_fp64(a, mul_fp64(b, yn))).x;
+  vec2 prod = twoProd(xn, diff);
+  return sum_fp64(yn, prod);
+}
+
+vec2 sqrt_fp64(vec2 a) {
+  if (a.x == 0.0 && a.y == 0.0) return vec2(0.0, 0.0);
+  if (a.x < 0.0) return vec2(0.0 / 0.0, 0.0 / 0.0);
+
+  float x = 1.0 / sqrt(a.x);
+  float yn = a.x * x;
+#if defined(LUMA_FP64_CODE_ELIMINATION_WORKAROUND)
+  vec2 yn_sqr = twoSqr(yn) * fp64.ONE;
+#else
+  vec2 yn_sqr = twoSqr(yn);
+#endif
+  float diff = sub_fp64(a, yn_sqr).x;
+  vec2 prod = twoProd(x * 0.5, diff);
+#if defined(LUMA_FP64_HIGH_BITS_OVERFLOW_WORKAROUND)
+  return sum_fp64(split(yn), prod);
+#else
+  return sum_fp64(vec2(yn, 0.0), prod);
+#endif
+}
+`;
diff --git a/modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts b/modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts
new file mode 100644
index 0000000000..516585f0aa
--- /dev/null
+++ b/modules/shadertools/src/modules/math/fp64/fp64-functions-glsl.ts
@@ -0,0 +1,675 @@
+// luma.gl
+// SPDX-License-Identifier: MIT
+// Copyright (c) vis.gl contributors
+
+export const fp64functionShader = /* glsl */ `\
+const vec2 E_FP64 = vec2(2.7182817459106445e+00, 8.254840366817007e-08);
+const vec2 LOG2_FP64 = vec2(0.6931471824645996e+00, -1.9046542121259336e-09);
+const vec2 PI_FP64 = vec2(3.1415927410125732, -8.742278012618954e-8);
+const vec2 TWO_PI_FP64 = vec2(6.2831854820251465, -1.7484556025237907e-7);
+const vec2 PI_2_FP64 = vec2(1.5707963705062866, -4.371139006309477e-8);
+const vec2 PI_4_FP64 = vec2(0.7853981852531433, -2.1855695031547384e-8);
+const vec2 PI_16_FP64 = vec2(0.19634954631328583, -5.463923757886846e-9);
+const vec2 PI_16_2_FP64 = vec2(0.39269909262657166, -1.0927847515773692e-8);
+const vec2 PI_16_3_FP64 = vec2(0.5890486240386963, -1.4906100798128818e-9);
+const vec2 PI_180_FP64 = vec2(0.01745329238474369, 1.3519960498364902e-10);
+
+const vec2 SIN_TABLE_0_FP64 = vec2(0.19509032368659973, -1.6704714833615242e-9);
+const vec2 SIN_TABLE_1_FP64 = vec2(0.3826834261417389, 6.22335089017767e-9);
+const vec2 SIN_TABLE_2_FP64 = vec2(0.5555702447891235, -1.1769521357507529e-8);
+const vec2 SIN_TABLE_3_FP64 = vec2(0.7071067690849304, 1.2101617041793133e-8);
+
+const vec2 COS_TABLE_0_FP64 = vec2(0.9807852506637573, 2.9739473106360492e-8);
+const vec2 COS_TABLE_1_FP64 = vec2(0.9238795042037964, 2.8307490351764386e-8);
+const vec2 COS_TABLE_2_FP64 = vec2(0.8314695954322815, 1.6870263741530778e-8);
+const vec2 COS_TABLE_3_FP64 = vec2(0.7071067690849304, 1.2101617152815436e-8);
+
+const vec2 INVERSE_FACTORIAL_3_FP64 = vec2(1.666666716337204e-01, -4.967053879312289e-09); // 1/3!
+const vec2 INVERSE_FACTORIAL_4_FP64 = vec2(4.16666679084301e-02, -1.2417634698280722e-09); // 1/4!
+const vec2 INVERSE_FACTORIAL_5_FP64 = vec2(8.333333767950535e-03, -4.34617203337595e-10); // 1/5!
+const vec2 INVERSE_FACTORIAL_6_FP64 = vec2(1.3888889225199819e-03, -3.3631094437103215e-11); // 1/6!
+const vec2 INVERSE_FACTORIAL_7_FP64 = vec2(1.9841270113829523e-04,  -2.725596874933456e-12); // 1/7!
+const vec2 INVERSE_FACTORIAL_8_FP64 = vec2(2.4801587642286904e-05, -3.406996025904184e-13); // 1/8!
+const vec2 INVERSE_FACTORIAL_9_FP64 = vec2(2.75573188446287533e-06, 3.7935713937038186e-14); // 1/9!
+const vec2 INVERSE_FACTORIAL_10_FP64 = vec2(2.755731998149713e-07, -7.575112367869873e-15); // 1/10!
+
+float nint(float d) {
+    if (d == floor(d)) return d;
+    return floor(d + 0.5);
+}
+
+vec2 nint_fp64(vec2 a) {
+    float hi = nint(a.x);
+    float lo;
+    vec2 tmp;
+    if (hi == a.x) {
+        lo = nint(a.y);
+        tmp = quickTwoSum(hi, lo);
+    } else {
+        lo = 0.0;
+        if (abs(hi - a.x) == 0.5 && a.y < 0.0) {
+            hi -= 1.0;
+        }
+        tmp = vec2(hi, lo);
+    }
+    return tmp;
+}
+
+/* k_power controls how much range reduction we would like to have
+Range reduction uses the following method:
+assume a = k_power * r + m * log(2), k and m being integers.
+Set k_power = 4 (we can choose other k to trade accuracy with performance.
+we only need to calculate exp(r) and using exp(a) = 2^m * exp(r)^k_power;
+*/
+
+vec2 exp_fp64(vec2 a) {
+  // We need to make sure these two numbers match
+  // as bit-wise shift is not available in GLSL 1.0
+  const int k_power = 4;
+  const float k = 16.0;
+
+  const float inv_k = 1.0 / k;
+
+  if (a.x <= -88.0) return vec2(0.0, 0.0);
+  if (a.x >= 88.0) return vec2(1.0 / 0.0, 1.0 / 0.0);
+  if (a.x == 0.0 && a.y == 0.0) return vec2(1.0, 0.0);
+  if (a.x == 1.0 && a.y == 0.0) return E_FP64;
+
+  float m = floor(a.x / LOG2_FP64.x + 0.5);
+  vec2 r = sub_fp64(a, mul_fp64(LOG2_FP64, vec2(m, 0.0))) * inv_k;
+  vec2 s, t, p;
+
+  p = mul_fp64(r, r);
+  s = sum_fp64(r, p * 0.5);
+  p = mul_fp64(p, r);
+  t = mul_fp64(p, INVERSE_FACTORIAL_3_FP64);
+
+  s = sum_fp64(s, t);
+  p = mul_fp64(p, r);
+  t = mul_fp64(p, INVERSE_FACTORIAL_4_FP64);
+
+  s = sum_fp64(s, t);
+  p = mul_fp64(p, r);
+  t = mul_fp64(p, INVERSE_FACTORIAL_5_FP64);
+
+  // s = sum_fp64(s, t);
+  // p = mul_fp64(p, r);
+  // t = mul_fp64(p, INVERSE_FACTORIAL_6_FP64);
+
+  // s = sum_fp64(s, t);
+  // p = mul_fp64(p, r);
+  // t = mul_fp64(p, INVERSE_FACTORIAL_7_FP64);
+
+  s = sum_fp64(s, t);
+
+
+  // At this point, s = exp(r) - 1; but after following 4 recursions, we will get exp(r) ^ 512 - 1.
+  for (int i = 0; i < k_power; i++) {
+    s = sum_fp64(s * 2.0, mul_fp64(s, s));
+  }
+
+#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND)
+  s = sum_fp64(s, vec2(fp64.ONE, 0.0));
+#else
+  s = sum_fp64(s, vec2(1.0, 0.0));
+#endif
+
+  return s * pow(2.0, m);
+//   return r;
+}
+
+vec2 log_fp64(vec2 a)
+{
+  if (a.x == 1.0 && a.y == 0.0) return vec2(0.0, 0.0);
+  if (a.x <= 0.0) return vec2(0.0 / 0.0, 0.0 / 0.0);
+  vec2 x = vec2(log(a.x), 0.0);
+  vec2 s;
+#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND)
+  s = vec2(fp64.ONE, 0.0);
+#else
+  s = vec2(1.0, 0.0);
+#endif
+
+  x = sub_fp64(sum_fp64(x, mul_fp64(a, exp_fp64(-x))), s);
+  return x;
+}
+
+vec2 sin_taylor_fp64(vec2 a) {
+  vec2 r, s, t, x;
+
+  if (a.x == 0.0 && a.y == 0.0) {
+    return vec2(0.0, 0.0);
+  }
+
+  x = -mul_fp64(a, a);
+  s = a;
+  r = a;
+
+  r = mul_fp64(r, x);
+  t = mul_fp64(r, INVERSE_FACTORIAL_3_FP64);
+  s = sum_fp64(s, t);
+
+  r = mul_fp64(r, x);
+  t = mul_fp64(r, INVERSE_FACTORIAL_5_FP64);
+  s = sum_fp64(s, t);
+
+  /* keep the following commented code in case we need them
+  for extra accuracy from the Taylor expansion*/
+
+  // r = mul_fp64(r, x);
+  // t = mul_fp64(r, INVERSE_FACTORIAL_7_FP64);
+  // s = sum_fp64(s, t);
+
+  // r = mul_fp64(r, x);
+  // t = mul_fp64(r, INVERSE_FACTORIAL_9_FP64);
+  // s = sum_fp64(s, t);
+
+  return s;
+}
+
+vec2 cos_taylor_fp64(vec2 a) {
+  vec2 r, s, t, x;
+
+  if (a.x == 0.0 && a.y == 0.0) {
+    return vec2(1.0, 0.0);
+  }
+
+  x = -mul_fp64(a, a);
+  r = x;
+  s = sum_fp64(vec2(1.0, 0.0), r * 0.5);
+
+  r = mul_fp64(r, x);
+  t = mul_fp64(r, INVERSE_FACTORIAL_4_FP64);
+  s = sum_fp64(s, t);
+
+  r = mul_fp64(r, x);
+  t = mul_fp64(r, INVERSE_FACTORIAL_6_FP64);
+  s = sum_fp64(s, t);
+
+  /* keep the following commented code in case we need them
+  for extra accuracy from the Taylor expansion*/
+
+  // r = mul_fp64(r, x);
+  // t = mul_fp64(r, INVERSE_FACTORIAL_8_FP64);
+  // s = sum_fp64(s, t);
+
+  // r = mul_fp64(r, x);
+  // t = mul_fp64(r, INVERSE_FACTORIAL_10_FP64);
+  // s = sum_fp64(s, t);
+
+  return s;
+}
+
+void sincos_taylor_fp64(vec2 a, out vec2 sin_t, out vec2 cos_t) {
+  if (a.x == 0.0 && a.y == 0.0) {
+    sin_t = vec2(0.0, 0.0);
+    cos_t = vec2(1.0, 0.0);
+  }
+
+  sin_t = sin_taylor_fp64(a);
+  cos_t = sqrt_fp64(sub_fp64(vec2(1.0, 0.0), mul_fp64(sin_t, sin_t)));
+}
+
+vec2 sin_fp64(vec2 a) {
+    if (a.x == 0.0 && a.y == 0.0) {
+        return vec2(0.0, 0.0);
+    }
+
+    // 2pi range reduction
+    vec2 z = nint_fp64(div_fp64(a, TWO_PI_FP64));
+    vec2 r = sub_fp64(a, mul_fp64(TWO_PI_FP64, z));
+
+    vec2 t;
+    float q = floor(r.x / PI_2_FP64.x + 0.5);
+    int j = int(q);
+
+    if (j < -2 || j > 2) {
+        return vec2(0.0 / 0.0, 0.0 / 0.0);
+    }
+
+    t = sub_fp64(r, mul_fp64(PI_2_FP64, vec2(q, 0.0)));
+
+    q = floor(t.x / PI_16_FP64.x + 0.5);
+    int k = int(q);
+
+    if (k == 0) {
+        if (j == 0) {
+            return sin_taylor_fp64(t);
+        } else if (j == 1) {
+            return cos_taylor_fp64(t);
+        } else if (j == -1) {
+            return -cos_taylor_fp64(t);
+        } else {
+            return -sin_taylor_fp64(t);
+        }
+    }
+
+    int abs_k = int(abs(float(k)));
+
+    if (abs_k > 4) {
+        return vec2(0.0 / 0.0, 0.0 / 0.0);
+    } else {
+        t = sub_fp64(t, mul_fp64(PI_16_FP64, vec2(q, 0.0)));
+    }
+
+    vec2 u = vec2(0.0, 0.0);
+    vec2 v = vec2(0.0, 0.0);
+
+#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND)
+    if (abs(float(abs_k) - 1.0) < 0.5) {
+        u = COS_TABLE_0_FP64;
+        v = SIN_TABLE_0_FP64;
+    } else if (abs(float(abs_k) - 2.0) < 0.5) {
+        u = COS_TABLE_1_FP64;
+        v = SIN_TABLE_1_FP64;
+    } else if (abs(float(abs_k) - 3.0) < 0.5) {
+        u = COS_TABLE_2_FP64;
+        v = SIN_TABLE_2_FP64;
+    } else if (abs(float(abs_k) - 4.0) < 0.5) {
+        u = COS_TABLE_3_FP64;
+        v = SIN_TABLE_3_FP64;
+    }
+#else
+    if (abs_k == 1) {
+        u = COS_TABLE_0_FP64;
+        v = SIN_TABLE_0_FP64;
+    } else if (abs_k == 2) {
+        u = COS_TABLE_1_FP64;
+        v = SIN_TABLE_1_FP64;
+    } else if (abs_k == 3) {
+        u = COS_TABLE_2_FP64;
+        v = SIN_TABLE_2_FP64;
+    } else if (abs_k == 4) {
+        u = COS_TABLE_3_FP64;
+        v = SIN_TABLE_3_FP64;
+    }
+#endif
+
+    vec2 sin_t, cos_t;
+    sincos_taylor_fp64(t, sin_t, cos_t);
+
+
+
+    vec2 result = vec2(0.0, 0.0);
+    if (j == 0) {
+        if (k > 0) {
+            result = sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+        } else {
+            result = sub_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+        }
+    } else if (j == 1) {
+        if (k > 0) {
+            result = sub_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t));
+        } else {
+            result = sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t));
+        }
+    } else if (j == -1) {
+        if (k > 0) {
+            result = sub_fp64(mul_fp64(v, sin_t), mul_fp64(u, cos_t));
+        } else {
+            result = -sum_fp64(mul_fp64(v, sin_t), mul_fp64(u, cos_t));
+        }
+    } else {
+        if (k > 0) {
+            result = -sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+        } else {
+            result = sub_fp64(mul_fp64(v, cos_t), mul_fp64(u, sin_t));
+        }
+    }
+
+    return result;
+}
+
+vec2 cos_fp64(vec2 a) {
+    if (a.x == 0.0 && a.y == 0.0) {
+        return vec2(1.0, 0.0);
+    }
+
+    // 2pi range reduction
+    vec2 z = nint_fp64(div_fp64(a, TWO_PI_FP64));
+    vec2 r = sub_fp64(a, mul_fp64(TWO_PI_FP64, z));
+
+    vec2 t;
+    float q = floor(r.x / PI_2_FP64.x + 0.5);
+    int j = int(q);
+
+    if (j < -2 || j > 2) {
+        return vec2(0.0 / 0.0, 0.0 / 0.0);
+    }
+
+    t = sub_fp64(r, mul_fp64(PI_2_FP64, vec2(q, 0.0)));
+
+    q = floor(t.x / PI_16_FP64.x + 0.5);
+    int k = int(q);
+
+    if (k == 0) {
+        if (j == 0) {
+            return cos_taylor_fp64(t);
+        } else if (j == 1) {
+            return -sin_taylor_fp64(t);
+        } else if (j == -1) {
+            return sin_taylor_fp64(t);
+        } else {
+            return -cos_taylor_fp64(t);
+        }
+    }
+
+    int abs_k = int(abs(float(k)));
+
+    if (abs_k > 4) {
+        return vec2(0.0 / 0.0, 0.0 / 0.0);
+    } else {
+        t = sub_fp64(t, mul_fp64(PI_16_FP64, vec2(q, 0.0)));
+    }
+
+    vec2 u = vec2(0.0, 0.0);
+    vec2 v = vec2(0.0, 0.0);
+
+#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND)
+    if (abs(float(abs_k) - 1.0) < 0.5) {
+        u = COS_TABLE_0_FP64;
+        v = SIN_TABLE_0_FP64;
+    } else if (abs(float(abs_k) - 2.0) < 0.5) {
+        u = COS_TABLE_1_FP64;
+        v = SIN_TABLE_1_FP64;
+    } else if (abs(float(abs_k) - 3.0) < 0.5) {
+        u = COS_TABLE_2_FP64;
+        v = SIN_TABLE_2_FP64;
+    } else if (abs(float(abs_k) - 4.0) < 0.5) {
+        u = COS_TABLE_3_FP64;
+        v = SIN_TABLE_3_FP64;
+    }
+#else
+    if (abs_k == 1) {
+        u = COS_TABLE_0_FP64;
+        v = SIN_TABLE_0_FP64;
+    } else if (abs_k == 2) {
+        u = COS_TABLE_1_FP64;
+        v = SIN_TABLE_1_FP64;
+    } else if (abs_k == 3) {
+        u = COS_TABLE_2_FP64;
+        v = SIN_TABLE_2_FP64;
+    } else if (abs_k == 4) {
+        u = COS_TABLE_3_FP64;
+        v = SIN_TABLE_3_FP64;
+    }
+#endif
+
+    vec2 sin_t, cos_t;
+    sincos_taylor_fp64(t, sin_t, cos_t);
+
+    vec2 result = vec2(0.0, 0.0);
+    if (j == 0) {
+        if (k > 0) {
+            result = sub_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t));
+        } else {
+            result = sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t));
+        }
+    } else if (j == 1) {
+        if (k > 0) {
+            result = -sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+        } else {
+            result = sub_fp64(mul_fp64(v, cos_t), mul_fp64(u, sin_t));
+        }
+    } else if (j == -1) {
+        if (k > 0) {
+            result = sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+        } else {
+            result = sub_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+        }
+    } else {
+        if (k > 0) {
+            result = sub_fp64(mul_fp64(v, sin_t), mul_fp64(u, cos_t));
+        } else {
+            result = -sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t));
+        }
+    }
+
+    return result;
+}
+
+vec2 tan_fp64(vec2 a) {
+    vec2 sin_a;
+    vec2 cos_a;
+
+    if (a.x == 0.0 && a.y == 0.0) {
+        return vec2(0.0, 0.0);
+    }
+
+    // 2pi range reduction
+    vec2 z = nint_fp64(div_fp64(a, TWO_PI_FP64));
+    vec2 r = sub_fp64(a, mul_fp64(TWO_PI_FP64, z));
+
+    vec2 t;
+    float q = floor(r.x / PI_2_FP64.x + 0.5);
+    int j = int(q);
+
+
+    if (j < -2 || j > 2) {
+        return vec2(0.0 / 0.0, 0.0 / 0.0);
+    }
+
+    t = sub_fp64(r, mul_fp64(PI_2_FP64, vec2(q, 0.0)));
+
+    q = floor(t.x / PI_16_FP64.x + 0.5);
+    int k = int(q);
+    int abs_k = int(abs(float(k)));
+
+    // We just can't get PI/16 * 3.0 very accurately.
+    // so let's just store it
+    if (abs_k > 4) {
+        return vec2(0.0 / 0.0, 0.0 / 0.0);
+    } else {
+        t = sub_fp64(t, mul_fp64(PI_16_FP64, vec2(q, 0.0)));
+    }
+
+
+    vec2 u = vec2(0.0, 0.0);
+    vec2 v = vec2(0.0, 0.0);
+
+    vec2 sin_t, cos_t;
+    vec2 s, c;
+    sincos_taylor_fp64(t, sin_t, cos_t);
+
+    if (k == 0) {
+        s = sin_t;
+        c = cos_t;
+    } else {
+#if defined(NVIDIA_FP64_WORKAROUND) || defined(INTEL_FP64_WORKAROUND)
+        if (abs(float(abs_k) - 1.0) < 0.5) {
+            u = COS_TABLE_0_FP64;
+            v = SIN_TABLE_0_FP64;
+        } else if (abs(float(abs_k) - 2.0) < 0.5) {
+            u = COS_TABLE_1_FP64;
+            v = SIN_TABLE_1_FP64;
+        } else if (abs(float(abs_k) - 3.0) < 0.5) {
+            u = COS_TABLE_2_FP64;
+            v = SIN_TABLE_2_FP64;
+        } else if (abs(float(abs_k) - 4.0) < 0.5) {
+            u = COS_TABLE_3_FP64;
+            v = SIN_TABLE_3_FP64;
+        }
+#else
+        if (abs_k == 1) {
+            u = COS_TABLE_0_FP64;
+            v = SIN_TABLE_0_FP64;
+        } else if (abs_k == 2) {
+            u = COS_TABLE_1_FP64;
+            v = SIN_TABLE_1_FP64;
+        } else if (abs_k == 3) {
+            u = COS_TABLE_2_FP64;
+            v = SIN_TABLE_2_FP64;
+        } else if (abs_k == 4) {
+            u = COS_TABLE_3_FP64;
+            v = SIN_TABLE_3_FP64;
+        }
+#endif
+        if (k > 0) {
+            s = sum_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+            c = sub_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t));
+        } else {
+            s = sub_fp64(mul_fp64(u, sin_t), mul_fp64(v, cos_t));
+            c = sum_fp64(mul_fp64(u, cos_t), mul_fp64(v, sin_t));
+        }
+    }
+
+    if (j == 0) {
+        sin_a = s;
+        cos_a = c;
+    } else if (j == 1) {
+        sin_a = c;
+        cos_a = -s;
+    } else if (j == -1) {
+        sin_a = -c;
+        cos_a = s;
+    } else {
+        sin_a = -s;
+        cos_a = -c;
+    }
+    return div_fp64(sin_a, cos_a);
+}
+
+vec2 radians_fp64(vec2 degree) {
+  return mul_fp64(degree, PI_180_FP64);
+}
+
+vec2 mix_fp64(vec2 a, vec2 b, float x) {
+  vec2 range = sub_fp64(b, a);
+  return sum_fp64(a, mul_fp64(range, vec2(x, 0.0)));
+}
+
+// Vector functions
+// vec2 functions
+void vec2_sum_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) {
+    out_val[0] = sum_fp64(a[0], b[0]);
+    out_val[1] = sum_fp64(a[1], b[1]);
+}
+
+void vec2_sub_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) {
+    out_val[0] = sub_fp64(a[0], b[0]);
+    out_val[1] = sub_fp64(a[1], b[1]);
+}
+
+void vec2_mul_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) {
+    out_val[0] = mul_fp64(a[0], b[0]);
+    out_val[1] = mul_fp64(a[1], b[1]);
+}
+
+void vec2_div_fp64(vec2 a[2], vec2 b[2], out vec2 out_val[2]) {
+    out_val[0] = div_fp64(a[0], b[0]);
+    out_val[1] = div_fp64(a[1], b[1]);
+}
+
+void vec2_mix_fp64(vec2 x[2], vec2 y[2], float a, out vec2 out_val[2]) {
+  vec2 range[2];
+  vec2_sub_fp64(y, x, range);
+  vec2 portion[2];
+  portion[0] = range[0] * a;
+  portion[1] = range[1] * a;
+  vec2_sum_fp64(x, portion, out_val);
+}
+
+vec2 vec2_length_fp64(vec2 x[2]) {
+  return sqrt_fp64(sum_fp64(mul_fp64(x[0], x[0]), mul_fp64(x[1], x[1])));
+}
+
+void vec2_normalize_fp64(vec2 x[2], out vec2 out_val[2]) {
+  vec2 length = vec2_length_fp64(x);
+  vec2 length_vec2[2];
+  length_vec2[0] = length;
+  length_vec2[1] = length;
+
+  vec2_div_fp64(x, length_vec2, out_val);
+}
+
+vec2 vec2_distance_fp64(vec2 x[2], vec2 y[2]) {
+  vec2 diff[2];
+  vec2_sub_fp64(x, y, diff);
+  return vec2_length_fp64(diff);
+}
+
+vec2 vec2_dot_fp64(vec2 a[2], vec2 b[2]) {
+  vec2 v[2];
+
+  v[0] = mul_fp64(a[0], b[0]);
+  v[1] = mul_fp64(a[1], b[1]);
+
+  return sum_fp64(v[0], v[1]);
+}
+
+// vec3 functions
+void vec3_sub_fp64(vec2 a[3], vec2 b[3], out vec2 out_val[3]) {
+  for (int i = 0; i < 3; i++) {
+    out_val[i] = sum_fp64(a[i], b[i]);
+  }
+}
+
+void vec3_sum_fp64(vec2 a[3], vec2 b[3], out vec2 out_val[3]) {
+  for (int i = 0; i < 3; i++) {
+    out_val[i] = sum_fp64(a[i], b[i]);
+  }
+}
+
+vec2 vec3_length_fp64(vec2 x[3]) {
+  return sqrt_fp64(sum_fp64(sum_fp64(mul_fp64(x[0], x[0]), mul_fp64(x[1], x[1])),
+    mul_fp64(x[2], x[2])));
+}
+
+vec2 vec3_distance_fp64(vec2 x[3], vec2 y[3]) {
+  vec2 diff[3];
+  vec3_sub_fp64(x, y, diff);
+  return vec3_length_fp64(diff);
+}
+
+// vec4 functions
+void vec4_fp64(vec4 a, out vec2 out_val[4]) {
+  out_val[0].x = a[0];
+  out_val[0].y = 0.0;
+
+  out_val[1].x = a[1];
+  out_val[1].y = 0.0;
+
+  out_val[2].x = a[2];
+  out_val[2].y = 0.0;
+
+  out_val[3].x = a[3];
+  out_val[3].y = 0.0;
+}
+
+void vec4_scalar_mul_fp64(vec2 a[4], vec2 b, out vec2 out_val[4]) {
+  out_val[0] = mul_fp64(a[0], b);
+  out_val[1] = mul_fp64(a[1], b);
+  out_val[2] = mul_fp64(a[2], b);
+  out_val[3] = mul_fp64(a[3], b);
+}
+
+void vec4_sum_fp64(vec2 a[4], vec2 b[4], out vec2 out_val[4]) {
+  for (int i = 0; i < 4; i++) {
+    out_val[i] = sum_fp64(a[i], b[i]);
+  }
+}
+
+void vec4_dot_fp64(vec2 a[4], vec2 b[4], out vec2 out_val) {
+  vec2 v[4];
+
+  v[0] = mul_fp64(a[0], b[0]);
+  v[1] = mul_fp64(a[1], b[1]);
+  v[2] = mul_fp64(a[2], b[2]);
+  v[3] = mul_fp64(a[3], b[3]);
+
+  out_val = sum_fp64(sum_fp64(v[0], v[1]), sum_fp64(v[2], v[3]));
+}
+
+void mat4_vec4_mul_fp64(vec2 b[16], vec2 a[4], out vec2 out_val[4]) {
+  vec2 tmp[4];
+
+  for (int i = 0; i < 4; i++)
+  {
+    for (int j = 0; j < 4; j++)
+    {
+      tmp[j] = b[j + i * 4];
+    }
+    vec4_dot_fp64(a, tmp, out_val[i]);
+  }
+}
+`;
diff --git a/modules/shadertools/src/modules/math/fp64/fp64.ts b/modules/shadertools/src/modules/math/fp64/fp64.ts
new file mode 100644
index 0000000000..43ceefa903
--- /dev/null
+++ b/modules/shadertools/src/modules/math/fp64/fp64.ts
@@ -0,0 +1,55 @@
+// luma.gl
+// SPDX-License-Identifier: MIT
+// Copyright (c) vis.gl contributors
+
+import {ShaderModule} from '../../../lib/shader-module/shader-module';
+
+import {fp64ify, fp64LowPart, fp64ifyMatrix4} from '../../../modules/math/fp64/fp64-utils';
+import {fp64arithmeticShader} from './fp64-arithmetic-glsl';
+import {fp64functionShader} from './fp64-functions-glsl';
+
+type FP64Props = {};
+type FP64Uniforms = {ONE: number};
+type FP64Bindings = {};
+
+type FP64Utilities = {
+  fp64ify: typeof fp64ify;
+  fp64LowPart: typeof fp64LowPart;
+  fp64ifyMatrix4: typeof fp64ifyMatrix4;
+};
+
+const defaultUniforms: FP64Uniforms = {
+  // Used in LUMA_FP64_CODE_ELIMINATION_WORKAROUND
+  ONE: 1.0
+};
+
+/**
+ * 64bit arithmetic: add, sub, mul, div (small subset of fp64 module)
+ */
+export const fp64arithmetic: ShaderModule<FP64Props, FP64Uniforms, FP64Bindings> & FP64Utilities = {
+  name: 'fp64arithmetic',
+  vs: fp64arithmeticShader,
+  defaultUniforms,
+  uniformTypes: {ONE: 'f32'},
+
+  // Additional Functions
+  fp64ify,
+  fp64LowPart,
+  fp64ifyMatrix4
+};
+
+/**
+ * Full 64 bit math library
+ */
+export const fp64: ShaderModule<{}> & FP64Utilities = {
+  name: 'fp64',
+  vs: fp64functionShader,
+  dependencies: [fp64arithmetic],
+
+  // Additional Functions
+  fp64ify,
+  fp64LowPart,
+  fp64ifyMatrix4
+};
+
+export {fp64ify, fp64LowPart, fp64ifyMatrix4};
diff --git a/modules/shadertools/test/index.ts b/modules/shadertools/test/index.ts
index ff16c33703..fc1ddbcda0 100644
--- a/modules/shadertools/test/index.ts
+++ b/modules/shadertools/test/index.ts
@@ -31,6 +31,7 @@ import './lib/shader-assembler.spec';
 
 // Data utilities
 import './modules/math/fp16-utils.spec';
+import './modules/math/fp64-arithmetic-transform.spec';
 import './modules/math/fp64-utils.spec';
 
 // General modules tests
diff --git a/modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts b/modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts
new file mode 100644
index 0000000000..530ce1892d
--- /dev/null
+++ b/modules/shadertools/test/modules/math/fp64-arithmetic-transform.spec.ts
@@ -0,0 +1,155 @@
+// luma.gl
+// SPDX-License-Identifier: MIT
+// Copyright (c) vis.gl contributors
+
+import test from 'tape-promise/tape';
+import {getWebGLTestDevice} from '@luma.gl/test-utils';
+import {runTests} from './fp64-test-utils-transform';
+
+// Failing test cases are ignored based on gpu and glslFunc, using ignoreFor field
+// ignoreFor: [{gpu: ['glslFunc-1', 'glslFunc-2']}] => ignores for `'glslFunc-1' and 'glslFunc-2` when running on `gpu`
+// Many of these tests fail on Apple GPUs with very large margins, see https://github.com/visgl/luma.gl/issues/1764.
+const commonTestCases = [
+  {a: 2, b: 2},
+  {a: 0.1, b: 0.1, ignoreFor: {apple: ['sum_fp64', 'mul_fp64', 'div_fp64']}},
+  {a: 3.0e-19, b: 3.3e13, ignoreFor: {apple: ['sum_fp64', 'sub_fp64']}},
+  {a: 9.9e-40, b: 1.7e3, ignoreFor: {}},
+  {a: 1.5e-36, b: 1.7e-16, ignoreFor: {}},
+  {a: 9.4e-26, b: 51, ignoreFor: {}},
+  {a: 6.7e-20, b: 0.93, ignoreFor: {apple: ['sum_fp64', 'sub_fp64']}},
+
+  // mul_fp64: Large numbers once multipled, can't be represented by 32 bit precision and Math.fround() returns NAN
+  // sqrt_fp64: Fail on INTEL with margin 3.906051071870294e-12
+  {
+    a: 2.4e3,
+    b: 5.9e31,
+    ignoreFor: {all: ['mul_fp64'], intel: ['sqrt_fp64'], apple: ['sum_fp64', 'sub_fp64']}
+  },
+
+  // div_fp64 fails on INTEL with margin 1.7318642528355118e-12
+  // sqrt_fp64 fails on INTEL with margin 1.5518878351528786e-12
+  {
+    a: 1.4e9,
+    b: 6.3e5,
+    ignoreFor: {intel: ['div_fp64', 'sqrt_fp64'], apple: ['mul_fp64', 'div_fp64']}
+  },
+
+  // div fails on INTEL with margin 1.7886288892678105e-14
+  // sqrt fails on INTEL with margin 2.5362810256331708e-12
+  {a: 3.0e9, b: 4.3e-23, ignoreFor: {intel: ['div_fp64', 'sqrt_fp64'], apple: ['div_fp64']}},
+
+  // div fail on INTEL with margin 1.137354350370519e-12
+  {a: 1.7e-19, b: 2.7e-27, ignoreFor: {intel: ['div_fp64'], apple: ['div_fp64']}},
+
+  // div_fp64 fails on INTEL with margin 2.7291999999999997e-12
+  // sqrt_fp64 fails on INTEL with margin 3.501857471494295e-12
+  {a: 0.3, b: 3.2e-16, ignoreFor: {intel: ['div_fp64', 'sqrt_fp64'], apple: ['div_fp64']}},
+
+  // mul_fp64 : fails since result can't be represented by 32 bit floats
+  // div_fp64 : fails on INTEL with margin 1.9999999999999994e-15
+  // sqrt_fp64 : fails on INTEL with margin 1.832115697751484e-12
+  {
+    a: 4.1e30,
+    b: 8.2e15,
+    ignoreFor: {all: ['mul_fp64'], intel: ['div_fp64', 'sqrt_fp64'], apple: ['div_fp64']}
+  },
+
+  // Fails on INTEL, margin 3.752606081210107e-12
+  {
+    a: 6.2e3,
+    b: 6.3e10,
+    ignoreFor: {intel: ['sqrt_fp64'], apple: ['sum_fp64', 'mul_fp64', 'sub_fp64']}
+  },
+  // Fails on INTEL, margin 3.872578286363912e-13
+  {a: 2.5e2, b: 5.1e-21, ignoreFor: {intel: ['sqrt_fp64'], apple: ['div_fp64']}},
+  // Fails on INTEL, margin 1.5332142001740705e-12
+  {a: 96, b: 1.7e4, ignoreFor: {intel: ['sqrt_fp64'], apple: ['div_fp64']}},
+  // // Fail on INTEL, margin 1.593162047558726e-12
+  {
+    a: 0.27,
+    b: 2.3e16,
+    ignoreFor: {intel: ['sqrt_fp64'], apple: ['sum_fp64', 'mul_fp64', 'sub_fp64']}
+  },
+  // Fails on INTEL, margin 1.014956357028767e-12
+  {a: 18, b: 9.1e-9, ignoreFor: {intel: ['sqrt_fp64'], apple: ['div_fp64']}}
+];
+
+// Filter all tests cases based on current gpu and glsFunc
+function getTestCasesFor(device, glslFunc) {
+  const debugInfo = device.info;
+  const testCases = commonTestCases.filter(testCase => {
+    if (testCase.ignoreFor) {
+      for (const gpu in testCase.ignoreFor) {
+        if (
+          (gpu === 'all' || debugInfo.vendor.toLowerCase().indexOf(gpu) >= 0) &&
+          testCase.ignoreFor[gpu].includes(glslFunc)
+        ) {
+          return false;
+        }
+      }
+    }
+    return true;
+  });
+  return testCases;
+}
+
+test('fp64#sum_fp64', async t => {
+  console.log('start');
+  const webglDevice = await getWebGLTestDevice();
+  console.log('device');
+
+  const glslFunc = 'sum_fp64';
+  const testCases = getTestCasesFor(webglDevice, glslFunc);
+  await runTests(webglDevice, {glslFunc, binary: true, op: (a, b) => a + b, testCases, t});
+  t.end();
+});
+
+test('fp64#sub_fp64', async t => {
+  const webglDevice = await getWebGLTestDevice();
+
+  const glslFunc = 'sub_fp64';
+  const testCases = getTestCasesFor(webglDevice, glslFunc);
+  await runTests(webglDevice, {glslFunc, binary: true, op: (a, b) => a - b, testCases, t});
+  t.end();
+});
+
+test('fp64#mul_fp64', async t => {
+  const webglDevice = await getWebGLTestDevice();
+
+  const glslFunc = 'mul_fp64';
+  const testCases = getTestCasesFor(webglDevice, glslFunc);
+  await runTests(webglDevice, {
+    glslFunc,
+    binary: true,
+    op: (a, b) => a * b,
+    limit: 128,
+    testCases,
+    t
+  });
+  t.end();
+});
+
+test('fp64#div_fp64', async t => {
+  const webglDevice = await getWebGLTestDevice();
+
+  const glslFunc = 'div_fp64';
+  const testCases = getTestCasesFor(webglDevice, glslFunc);
+  await runTests(webglDevice, {
+    glslFunc,
+    binary: true,
+    op: (a, b) => a / b,
+    limit: 128,
+    testCases,
+    t
+  });
+  t.end();
+});
+
+test('fp64#sqrt_fp64', async t => {
+  const webglDevice = await getWebGLTestDevice();
+
+  const glslFunc = 'sqrt_fp64';
+  const testCases = getTestCasesFor(webglDevice, glslFunc);
+  await runTests(webglDevice, {glslFunc, op: a => Math.sqrt(a), limit: 128, testCases, t});
+  t.end();
+});
diff --git a/modules/shadertools/test/modules/math/fp64-test-utils-transform.ts b/modules/shadertools/test/modules/math/fp64-test-utils-transform.ts
new file mode 100644
index 0000000000..5f778cf2bd
--- /dev/null
+++ b/modules/shadertools/test/modules/math/fp64-test-utils-transform.ts
@@ -0,0 +1,130 @@
+// luma.gl
+// SPDX-License-Identifier: MIT
+// Copyright (c) vis.gl contributors
+
+// Special utility functions for df64 tests
+
+/* eslint-disable camelcase, prefer-template, max-len */
+
+import {Device} from '@luma.gl/core';
+import {BufferTransform} from '@luma.gl/engine';
+import {fp64} from '@luma.gl/shadertools';
+import {equals, config} from '@math.gl/core';
+const {fp64ify} = fp64;
+
+// Use 'invariant' specifier to work around some issues on Apple GPUs. The
+// specifier may or may not have an effect, depending on the browser and the
+// ANGLE backend, but it's an improvement when it's supported.
+// See: https://github.com/visgl/luma.gl/issues/1764
+
+function getBinaryShader(operation: string): string {
+  const shader = `\
+#version 300 es
+in vec2 a;
+in vec2 b;
+invariant out vec2 result;
+void main(void) {
+  result = ${operation}(a, b);
+}
+`;
+  return shader;
+}
+
+function getUnaryShader(operation: string): string {
+  return `\
+#version 300 es
+in vec2 a;
+in vec2 b;
+invariant out vec2 result;
+void main(void) {
+  result = ${operation}(a);
+}
+`;
+}
+
+config.EPSILON = 1e-11;
+
+function setupFloatData({limit, op, testCases}) {
+  const count = testCases.length;
+  const a_fp64 = new Float32Array(count * 2);
+  const b_fp64 = new Float32Array(count * 2);
+  const expected_fp64 = new Float32Array(count * 2);
+  const a = new Array(count);
+  const b = new Array(count);
+  const expected = new Array(count);
+  for (let idx = 0; idx < count; idx++) {
+    const index = idx * 2;
+    a[idx] = testCases[idx].a;
+    b[idx] = testCases[idx].b;
+    expected[idx] = op(a[idx], b[idx]);
+
+    fp64ify(a[idx], a_fp64, index);
+    fp64ify(b[idx], b_fp64, index);
+    fp64ify(expected[idx], expected_fp64, index);
+  }
+  return {a, b, expected, a_fp64, b_fp64, expected_fp64};
+}
+
+function setupFloatTest(device: Device, {glslFunc, binary = false, limit = 256, op, testCases}) {
+  const {a, b, expected, a_fp64, b_fp64, expected_fp64} = setupFloatData({limit, op, testCases});
+  const vs = binary ? getBinaryShader(glslFunc) : getUnaryShader(glslFunc);
+  const bufferA = device.createBuffer({data: a_fp64});
+  const bufferB = device.createBuffer({data: b_fp64});
+  const bufferResult = device.createBuffer({byteLength: a_fp64.byteLength});
+  const transform = new BufferTransform(device, {
+    vs,
+    modules: [fp64],
+    attributes: {a: bufferA, b: bufferB},
+    bufferLayout: [
+      {name: 'a', format: 'float32x2'},
+      {name: 'b', format: 'float32x2'}
+    ],
+    feedbackBuffers: {result: bufferResult},
+    outputs: ['result'],
+    vertexCount: testCases.length
+  });
+  return {a, b, expected, a_fp64, b_fp64, expected_fp64, transform};
+}
+
+export async function runTests(
+  device: Device,
+  {glslFunc, binary = false, op, limit = 256, testCases, t}
+) {
+  if (!BufferTransform.isSupported(device)) {
+    t.comment('Transform not supported, skipping tests');
+    t.end();
+    return;
+  }
+
+  const {a, b, a_fp64, b_fp64, expected_fp64, transform} = setupFloatTest(device, {
+    glslFunc,
+    binary,
+    op,
+    limit,
+    testCases
+  });
+
+  transform.run();
+
+  const {buffer, byteOffset, byteLength} = await transform.readAsync('result');
+  const gpuResult = new Float32Array(
+    buffer,
+    byteOffset,
+    byteLength / Float32Array.BYTES_PER_ELEMENT
+  );
+  for (let idx = 0; idx < testCases.length; idx++) {
+    const reference64 = expected_fp64[2 * idx] + expected_fp64[2 * idx + 1];
+    const result64 = gpuResult[2 * idx] + gpuResult[2 * idx + 1];
+
+    const args = binary
+      ? `(${a[idx].toPrecision(2)}, ${b[idx].toPrecision(2)})`
+      : `(${a[idx].toPrecision(2)})`;
+    const message = `${glslFunc}${args} error within tolerance`;
+    const isEqual = equals(reference64, result64);
+    t.ok(isEqual, message);
+    if (!isEqual) {
+      t.comment(` (tested ${a_fp64.toString()}, ${b_fp64.toString()})`);
+      t.comment(` (reference ${reference64}, result64 ${result64})`);
+    }
+  }
+}