Add CUDA GPU JIT compilation to faddeeva() (tardis-sn#107)

* Add cuda import * Add cuda boilerplate for faddeeva * Add square root of pi as constant to scope of entire file * Add faddeeva_cuda test * Vectorize faddeeva * Refactor faddeeva to be vectorized and work with cuda * Clean up faddeeva_cuda test * Rename variables to keep with convention * Refactor faddeeva_gpu and include associated tests * Add functionality for more datatypes for faddeeva_cuda Also add associated tests * Clean up faddeeva_gpu tests by testing numpy and cuda array types separately * Optimize faddeeva function to be branchless * Use cupy arrays instead of numba.cuda arrays Also call faddeeva_cuda with prespecified numbers of blocks and threads * Only import cupy if the machine has GPUs available * Typecast input to complex * Size should be of output array * Return cupy array by default
smokestacklightnin · Aug 17, 2023 · 9c2da67 · 9c2da67
1 parent fce755a
commit 9c2da67
Show file tree

Hide file tree

Showing 2 changed files with 179 additions and 60 deletions.
diff --git a/stardis/opacities/tests/test_voigt.py b/stardis/opacities/tests/test_voigt.py
@@ -1,26 +1,111 @@
 import pytest
-from numpy import allclose, pi as PI
+import numpy as np
 from math import sqrt
+from numba import cuda
 
-from stardis.opacities.voigt import faddeeva, voigt_profile
+from stardis.opacities.voigt import (
+    faddeeva,
+    _faddeeva_cuda,
+    faddeeva_cuda,
+    voigt_profile,
+)
+
+# Test cases must also take into account use of a GPU to run. If there is no GPU then the test cases will fail.
+GPUs_available = cuda.is_available()
+
+if GPUs_available:
+    import cupy as cp
 
 
 @pytest.mark.parametrize(
     "faddeeva_sample_values_input, faddeeva_sample_values_expected_result",
     [
         (0, 1 + 0j),
         (0.0, 1.0 + 0.0j),
+        (np.array([0.0]), np.array([1.0 + 0.0j])),
+        (np.array([0, 0]), np.array([1 + 0j, 1 + 0j])),
     ],
 )
 def test_faddeeva_sample_values(
     faddeeva_sample_values_input, faddeeva_sample_values_expected_result
 ):
-    assert allclose(
+    assert np.allclose(
         faddeeva(faddeeva_sample_values_input),
         faddeeva_sample_values_expected_result,
     )
 
 
+@pytest.mark.skipif(
+    not GPUs_available, reason="No GPU is available to test CUDA function"
+)
+@pytest.mark.parametrize(
+    "faddeeva_cuda_unwrapped_sample_values_input, faddeeva_cuda_unwrapped_sample_values_expected_result",
+    [
+        (np.array([0.0], dtype=complex), np.array([1.0 + 0.0j])),
+        (np.array([0, 0], dtype=complex), np.array([1 + 0j, 1 + 0j])),
+    ],
+)
+def test_faddeeva_cuda_unwrapped_sample_values(
+    faddeeva_cuda_unwrapped_sample_values_input,
+    faddeeva_cuda_unwrapped_sample_values_expected_result,
+):
+    test_values = cp.asarray(faddeeva_cuda_unwrapped_sample_values_input)
+    result_values = cp.empty_like(test_values)
+
+    nthreads = 256
+    length = len(faddeeva_cuda_unwrapped_sample_values_input)
+    nblocks = 1 + (length // nthreads)
+
+    _faddeeva_cuda[nblocks, nthreads](result_values, test_values)
+
+    assert np.allclose(
+        cp.asnumpy(result_values),
+        faddeeva_cuda_unwrapped_sample_values_expected_result,
+    )
+
+
+@pytest.mark.skipif(
+    not GPUs_available, reason="No GPU is available to test CUDA function"
+)
+@pytest.mark.parametrize(
+    "faddeeva_cuda_wrapped_sample_numpy_values_input, faddeeva_cuda_wrapped_sample_numpy_values_expected_result",
+    [
+        (np.array([0.0]), np.array([1.0 + 0.0j])),
+        (np.array([0, 0]), np.array([1 + 0j, 1 + 0j])),
+    ],
+)
+def test_faddeeva_cuda_wrapped_sample_numpy_values(
+    faddeeva_cuda_wrapped_sample_numpy_values_input,
+    faddeeva_cuda_wrapped_sample_numpy_values_expected_result,
+):
+    assert np.allclose(
+        faddeeva_cuda(faddeeva_cuda_wrapped_sample_numpy_values_input),
+        faddeeva_cuda_wrapped_sample_numpy_values_expected_result,
+    )
+
+
+@pytest.mark.skipif(
+    not GPUs_available, reason="No GPU is available to test CUDA function"
+)
+@pytest.mark.parametrize(
+    "faddeeva_cuda_wrapped_sample_cuda_values_input, faddeeva_cuda_wrapped_sample_cuda_values_expected_result",
+    [
+        (
+            np.array([0, 0], dtype=complex),
+            np.array([1 + 0j, 1 + 0j]),
+        ),
+    ],
+)
+def test_faddeeva_cuda_wrapped_sample_cuda_values(
+    faddeeva_cuda_wrapped_sample_cuda_values_input,
+    faddeeva_cuda_wrapped_sample_cuda_values_expected_result,
+):
+    assert np.allclose(
+        faddeeva_cuda(cp.asarray(faddeeva_cuda_wrapped_sample_cuda_values_input)),
+        faddeeva_cuda_wrapped_sample_cuda_values_expected_result,
+    )
+
+
 test_voigt_profile_division_by_zero_test_values = [
     -100,
     -5,
@@ -57,8 +142,8 @@ def test_voigt_profile_division_by_zero(
 @pytest.mark.parametrize(
     "voigt_profile_sample_values_input_delta_nu, voigt_profile_sample_values_input_doppler_width, voigt_profile_sample_values_input_gamma, voigt_profile_sample_values_expected_result",
     [
-        (0, 1, 0, 1 / sqrt(PI)),
-        (0, 2, 0, 1 / (sqrt(PI) * 2)),
+        (0, 1, 0, 1 / sqrt(np.pi)),
+        (0, 2, 0, 1 / (sqrt(np.pi) * 2)),
     ],
 )
 def test_voigt_profile_sample_values_sample_values(
@@ -67,7 +152,7 @@ def test_voigt_profile_sample_values_sample_values(
     voigt_profile_sample_values_input_gamma,
     voigt_profile_sample_values_expected_result,
 ):
-    assert allclose(
+    assert np.allclose(
         voigt_profile(
             voigt_profile_sample_values_input_delta_nu,
             voigt_profile_sample_values_input_doppler_width,

diff --git a/stardis/opacities/voigt.py b/stardis/opacities/voigt.py
@@ -1,9 +1,18 @@
 import numpy as np
 import numba
+from numba import cuda
+import cmath
 
+GPUs_available = cuda.is_available()
 
-@numba.njit
-def faddeeva(z):
+if GPUs_available:
+    import cupy as cp
+
+SQRT_PI = np.sqrt(np.pi, dtype=float)
+
+
+@numba.njit()
+def _faddeeva(z):
     """
     The Faddeeva function. Code adapted from
     https://github.com/tiagopereira/Transparency.jl/blob/966fb46c21/src/voigt.jl#L13.
@@ -16,63 +25,88 @@ def faddeeva(z):
     -------
     w : complex
     """
-    s = abs(np.real(z)) + np.imag(z)
-
-    if s > 15.0:
-        # region I
-        w = 1j * 1 / np.sqrt(np.pi) * z / (z**2 - 0.5)
-
-    elif s > 5.5:
-        # region II
-        w = (
-            1j
-            * (z * (z**2 * 1 / np.sqrt(np.pi) - 1.4104739589))
-            / (0.75 + z**2 * (z**2 - 3.0))
+    z = complex(z)
+    x = float(z.real)
+    y = float(z.imag)
+    t = y - 1j * x
+    s = abs(x) + y
+    w = complex(0.0)
+    u = t * t
+
+    IN_REGION_I = s > 15.0
+    IN_REGION_II = (not IN_REGION_I) and (s > 5.5)
+    IN_REGION_III = (
+        (not IN_REGION_I) and (not IN_REGION_II) and (y >= 0.195 * abs(x) - 0.176)
+    )
+    IN_REGION_IV = (not IN_REGION_I) and (not IN_REGION_II) and (not IN_REGION_III)
+
+    # If in Region I
+    w = 1j * 1 / SQRT_PI * z / (z**2 - 0.5) if IN_REGION_I else w
+
+    # If in Region II
+    w = (
+        1j
+        * (z * (z**2 * 1 / SQRT_PI - 1.4104739589))
+        / (0.75 + z**2 * (z**2 - 3.0))
+        if IN_REGION_II
+        else w
+    )
+
+    # If in Region III
+    w = (
+        (16.4955 + t * (20.20933 + t * (11.96482 + t * (3.778987 + 0.5642236 * t))))
+        / (
+            16.4955
+            + t * (38.82363 + t * (39.27121 + t * (21.69274 + t * (6.699398 + t))))
         )
+        if IN_REGION_III
+        else w
+    )
 
-    else:
-        x = np.real(z)
-        y = np.imag(z)
-        t = y - 1j * x
-
-        if y >= 0.195 * abs(x) - 0.176:
-            # region III
-            w = (
-                16.4955
-                + t * (20.20933 + t * (11.96482 + t * (3.778987 + 0.5642236 * t)))
-            ) / (
-                16.4955
-                + t * (38.82363 + t * (39.27121 + t * (21.69274 + t * (6.699398 + t))))
-            )
-
-        else:
-            # region IV
-            u = t * t
-            numerator = t * (
-                36183.31
-                - u
-                * (
-                    3321.99
-                    - u
-                    * (
-                        1540.787
-                        - u * (219.031 - u * (35.7668 - u * (1.320522 - u * 0.56419)))
-                    )
-                )
-            )
-            denominantor = 32066.6 - u * (
-                24322.8
-                - u
-                * (
-                    9022.23
-                    - u * (2186.18 - u * (364.219 - u * (61.5704 - u * (1.84144 - u))))
-                )
-            )
-            w = np.exp(u) - numerator / denominantor
+    # If in Region IV
+    numerator = t * (
+        36183.31
+        - u
+        * (
+            3321.99
+            - u
+            * (1540.787 - u * (219.031 - u * (35.7668 - u * (1.320522 - u * 0.56419))))
+        )
+    )
+    denominator = 32066.6 - u * (
+        24322.8
+        - u
+        * (9022.23 - u * (2186.18 - u * (364.219 - u * (61.5704 - u * (1.84144 - u)))))
+    )
+    w = (cmath.exp(u) - numerator / denominator) if IN_REGION_IV else w
 
     return w
 
 
+@numba.vectorize(nopython=True)
+def faddeeva(z):
+    return _faddeeva(z)
+
+
+@cuda.jit
+def _faddeeva_cuda(res, z):
+    tid = cuda.grid(1)
+    size = len(res)
+
+    if tid < size:
+        res[tid] = _faddeeva(z[tid])
+
+
+def faddeeva_cuda(z, nthreads=256, ret_np_ndarray=False):
+    size = len(z)
+    nblocks = 1 + (size // nthreads)
+    z = cp.asarray(z, dtype=complex)
+    res = cp.empty_like(z)
+
+    _faddeeva_cuda[nblocks, nthreads](res, z)
+    return cp.asnumpy(res) if ret_np_ndarray else res
+
+
 @numba.njit
 def voigt_profile(delta_nu, doppler_width, gamma):
     """
@@ -95,5 +129,5 @@ def voigt_profile(delta_nu, doppler_width, gamma):
         Value of Voigt profile.
     """
     z = (delta_nu + (gamma / (4 * np.pi)) * 1j) / doppler_width
-    phi = np.real(faddeeva(z)) / (np.sqrt(np.pi) * doppler_width)
+    phi = np.real(faddeeva(z)) / (SQRT_PI * doppler_width)
     return phi