Skip to content

Commit

Permalink
Add CUDA GPU JIT compilation to faddeeva() (tardis-sn#107)
Browse files Browse the repository at this point in the history
* Add cuda import

* Add cuda boilerplate for faddeeva

* Add square root of pi as constant to scope of entire file

* Add faddeeva_cuda test

* Vectorize faddeeva

* Refactor faddeeva to be vectorized and work with cuda

* Clean up faddeeva_cuda test

* Rename variables to keep with convention

* Refactor faddeeva_gpu and include associated tests

* Add functionality for more datatypes for faddeeva_cuda

Also add associated tests

* Clean up faddeeva_gpu tests by testing numpy and cuda array types separately

* Optimize faddeeva function to be branchless

* Use cupy arrays instead of numba.cuda arrays

Also call faddeeva_cuda with prespecified numbers of blocks and threads

* Only import cupy if the machine has GPUs available

* Typecast input to complex

* Size should be of output array

* Return cupy array by default
  • Loading branch information
smokestacklightnin committed Aug 17, 2023
1 parent fce755a commit 9c2da67
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 60 deletions.
97 changes: 91 additions & 6 deletions stardis/opacities/tests/test_voigt.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,111 @@
import pytest
from numpy import allclose, pi as PI
import numpy as np
from math import sqrt
from numba import cuda

from stardis.opacities.voigt import faddeeva, voigt_profile
from stardis.opacities.voigt import (
faddeeva,
_faddeeva_cuda,
faddeeva_cuda,
voigt_profile,
)

# Test cases must also take into account use of a GPU to run. If there is no GPU then the test cases will fail.
GPUs_available = cuda.is_available()

if GPUs_available:
import cupy as cp


@pytest.mark.parametrize(
"faddeeva_sample_values_input, faddeeva_sample_values_expected_result",
[
(0, 1 + 0j),
(0.0, 1.0 + 0.0j),
(np.array([0.0]), np.array([1.0 + 0.0j])),
(np.array([0, 0]), np.array([1 + 0j, 1 + 0j])),
],
)
def test_faddeeva_sample_values(
faddeeva_sample_values_input, faddeeva_sample_values_expected_result
):
assert allclose(
assert np.allclose(
faddeeva(faddeeva_sample_values_input),
faddeeva_sample_values_expected_result,
)


@pytest.mark.skipif(
not GPUs_available, reason="No GPU is available to test CUDA function"
)
@pytest.mark.parametrize(
"faddeeva_cuda_unwrapped_sample_values_input, faddeeva_cuda_unwrapped_sample_values_expected_result",
[
(np.array([0.0], dtype=complex), np.array([1.0 + 0.0j])),
(np.array([0, 0], dtype=complex), np.array([1 + 0j, 1 + 0j])),
],
)
def test_faddeeva_cuda_unwrapped_sample_values(
faddeeva_cuda_unwrapped_sample_values_input,
faddeeva_cuda_unwrapped_sample_values_expected_result,
):
test_values = cp.asarray(faddeeva_cuda_unwrapped_sample_values_input)
result_values = cp.empty_like(test_values)

nthreads = 256
length = len(faddeeva_cuda_unwrapped_sample_values_input)
nblocks = 1 + (length // nthreads)

_faddeeva_cuda[nblocks, nthreads](result_values, test_values)

assert np.allclose(
cp.asnumpy(result_values),
faddeeva_cuda_unwrapped_sample_values_expected_result,
)


@pytest.mark.skipif(
not GPUs_available, reason="No GPU is available to test CUDA function"
)
@pytest.mark.parametrize(
"faddeeva_cuda_wrapped_sample_numpy_values_input, faddeeva_cuda_wrapped_sample_numpy_values_expected_result",
[
(np.array([0.0]), np.array([1.0 + 0.0j])),
(np.array([0, 0]), np.array([1 + 0j, 1 + 0j])),
],
)
def test_faddeeva_cuda_wrapped_sample_numpy_values(
faddeeva_cuda_wrapped_sample_numpy_values_input,
faddeeva_cuda_wrapped_sample_numpy_values_expected_result,
):
assert np.allclose(
faddeeva_cuda(faddeeva_cuda_wrapped_sample_numpy_values_input),
faddeeva_cuda_wrapped_sample_numpy_values_expected_result,
)


@pytest.mark.skipif(
not GPUs_available, reason="No GPU is available to test CUDA function"
)
@pytest.mark.parametrize(
"faddeeva_cuda_wrapped_sample_cuda_values_input, faddeeva_cuda_wrapped_sample_cuda_values_expected_result",
[
(
np.array([0, 0], dtype=complex),
np.array([1 + 0j, 1 + 0j]),
),
],
)
def test_faddeeva_cuda_wrapped_sample_cuda_values(
faddeeva_cuda_wrapped_sample_cuda_values_input,
faddeeva_cuda_wrapped_sample_cuda_values_expected_result,
):
assert np.allclose(
faddeeva_cuda(cp.asarray(faddeeva_cuda_wrapped_sample_cuda_values_input)),
faddeeva_cuda_wrapped_sample_cuda_values_expected_result,
)


test_voigt_profile_division_by_zero_test_values = [
-100,
-5,
Expand Down Expand Up @@ -57,8 +142,8 @@ def test_voigt_profile_division_by_zero(
@pytest.mark.parametrize(
"voigt_profile_sample_values_input_delta_nu, voigt_profile_sample_values_input_doppler_width, voigt_profile_sample_values_input_gamma, voigt_profile_sample_values_expected_result",
[
(0, 1, 0, 1 / sqrt(PI)),
(0, 2, 0, 1 / (sqrt(PI) * 2)),
(0, 1, 0, 1 / sqrt(np.pi)),
(0, 2, 0, 1 / (sqrt(np.pi) * 2)),
],
)
def test_voigt_profile_sample_values_sample_values(
Expand All @@ -67,7 +152,7 @@ def test_voigt_profile_sample_values_sample_values(
voigt_profile_sample_values_input_gamma,
voigt_profile_sample_values_expected_result,
):
assert allclose(
assert np.allclose(
voigt_profile(
voigt_profile_sample_values_input_delta_nu,
voigt_profile_sample_values_input_doppler_width,
Expand Down
142 changes: 88 additions & 54 deletions stardis/opacities/voigt.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import numpy as np
import numba
from numba import cuda
import cmath

GPUs_available = cuda.is_available()

@numba.njit
def faddeeva(z):
if GPUs_available:
import cupy as cp

SQRT_PI = np.sqrt(np.pi, dtype=float)


@numba.njit()
def _faddeeva(z):
"""
The Faddeeva function. Code adapted from
https://github.com/tiagopereira/Transparency.jl/blob/966fb46c21/src/voigt.jl#L13.
Expand All @@ -16,63 +25,88 @@ def faddeeva(z):
-------
w : complex
"""
s = abs(np.real(z)) + np.imag(z)

if s > 15.0:
# region I
w = 1j * 1 / np.sqrt(np.pi) * z / (z**2 - 0.5)

elif s > 5.5:
# region II
w = (
1j
* (z * (z**2 * 1 / np.sqrt(np.pi) - 1.4104739589))
/ (0.75 + z**2 * (z**2 - 3.0))
z = complex(z)
x = float(z.real)
y = float(z.imag)
t = y - 1j * x
s = abs(x) + y
w = complex(0.0)
u = t * t

IN_REGION_I = s > 15.0
IN_REGION_II = (not IN_REGION_I) and (s > 5.5)
IN_REGION_III = (
(not IN_REGION_I) and (not IN_REGION_II) and (y >= 0.195 * abs(x) - 0.176)
)
IN_REGION_IV = (not IN_REGION_I) and (not IN_REGION_II) and (not IN_REGION_III)

# If in Region I
w = 1j * 1 / SQRT_PI * z / (z**2 - 0.5) if IN_REGION_I else w

# If in Region II
w = (
1j
* (z * (z**2 * 1 / SQRT_PI - 1.4104739589))
/ (0.75 + z**2 * (z**2 - 3.0))
if IN_REGION_II
else w
)

# If in Region III
w = (
(16.4955 + t * (20.20933 + t * (11.96482 + t * (3.778987 + 0.5642236 * t))))
/ (
16.4955
+ t * (38.82363 + t * (39.27121 + t * (21.69274 + t * (6.699398 + t))))
)
if IN_REGION_III
else w
)

else:
x = np.real(z)
y = np.imag(z)
t = y - 1j * x

if y >= 0.195 * abs(x) - 0.176:
# region III
w = (
16.4955
+ t * (20.20933 + t * (11.96482 + t * (3.778987 + 0.5642236 * t)))
) / (
16.4955
+ t * (38.82363 + t * (39.27121 + t * (21.69274 + t * (6.699398 + t))))
)

else:
# region IV
u = t * t
numerator = t * (
36183.31
- u
* (
3321.99
- u
* (
1540.787
- u * (219.031 - u * (35.7668 - u * (1.320522 - u * 0.56419)))
)
)
)
denominantor = 32066.6 - u * (
24322.8
- u
* (
9022.23
- u * (2186.18 - u * (364.219 - u * (61.5704 - u * (1.84144 - u))))
)
)
w = np.exp(u) - numerator / denominantor
# If in Region IV
numerator = t * (
36183.31
- u
* (
3321.99
- u
* (1540.787 - u * (219.031 - u * (35.7668 - u * (1.320522 - u * 0.56419))))
)
)
denominator = 32066.6 - u * (
24322.8
- u
* (9022.23 - u * (2186.18 - u * (364.219 - u * (61.5704 - u * (1.84144 - u)))))
)
w = (cmath.exp(u) - numerator / denominator) if IN_REGION_IV else w

return w


@numba.vectorize(nopython=True)
def faddeeva(z):
return _faddeeva(z)


@cuda.jit
def _faddeeva_cuda(res, z):
tid = cuda.grid(1)
size = len(res)

if tid < size:
res[tid] = _faddeeva(z[tid])


def faddeeva_cuda(z, nthreads=256, ret_np_ndarray=False):
size = len(z)
nblocks = 1 + (size // nthreads)
z = cp.asarray(z, dtype=complex)
res = cp.empty_like(z)

_faddeeva_cuda[nblocks, nthreads](res, z)
return cp.asnumpy(res) if ret_np_ndarray else res


@numba.njit
def voigt_profile(delta_nu, doppler_width, gamma):
"""
Expand All @@ -95,5 +129,5 @@ def voigt_profile(delta_nu, doppler_width, gamma):
Value of Voigt profile.
"""
z = (delta_nu + (gamma / (4 * np.pi)) * 1j) / doppler_width
phi = np.real(faddeeva(z)) / (np.sqrt(np.pi) * doppler_width)
phi = np.real(faddeeva(z)) / (SQRT_PI * doppler_width)
return phi

0 comments on commit 9c2da67

Please sign in to comment.