Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Idx variants #160

Merged
merged 19 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions libdistopia/include/distopia.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ namespace distopia {
template <typename T> void CalcSelfDistanceArrayNoBox(const T *a, int n, T *out);
template <typename T> void CalcSelfDistanceArrayOrtho(const T *a, int n, const T *box, T *out);
template <typename T> void CalcSelfDistanceArrayTriclinic(const T *a, int n, const T *box, T *out);
template <typename T> void CalcBondsNoBoxIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, int n, T *out);
template <typename T> void CalcBondsOrthoIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, int n, const T *box, T *out);
template <typename T> void CalcBondsTriclinicIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, int n, const T *box, T *out);
template <typename T> void CalcAnglesNoBoxIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, int n, T *out);
template <typename T> void CalcAnglesOrthoIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, int n, const T *box, T *out);
template <typename T> void CalcAnglesTriclinicIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, int n, const T *box, T *out);
int GetNFloatLanes();
int GetNDoubleLanes();
std::vector<std::string> DistopiaSupportedAndGeneratedTargets();
Expand Down
307 changes: 306 additions & 1 deletion libdistopia/src/distopia.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,169 @@ namespace distopia {
}
}

template <typename V>
HWY_INLINE void LoadInterleavedIdx(const unsigned int *idx, const float *src,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hmacdope this is the interesting function that does the IDX gather here.

In general, the width of the integer part of the GatherIndexneeds to match the width of the floating point type, so for the current signature of int indices, the float case is simple and the double case requires an upcase (see below).

If we wanted to have int64 indices in the signature for Idx functions, we'd probably need two more specialisations to this function, which isn't too much of a drama. The int64 and float case is a little odd as arguably the ints could overflow, so maybe you couldn't vectorise this as hard... (using current highway bindings at least).

V &x_dst, V &y_dst, V &z_dst) {
hn::ScalableTag<float> d;
hn::ScalableTag<int> d_int;
auto vidx = hn::LoadU(d_int, (int*)idx);
auto Vthree = hn::Set(d_int, 3);
auto Vone = hn::Set(d_int, 1);
// convert indices to 3D coordinate indices, i.e. index 10 at pointer 30 etc
vidx = hn::Mul(vidx, Vthree);
x_dst = hn::GatherIndex(d, src, vidx);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't actually that good, I imagine for performance you instead would want to do loads of xyz vectors (with junk) then a transpose, but you'd have to write this for all vector widths, at which point maybe you'd be better off doing a contribution upstream (GatherIndexInterleaved3D...) rather than writing so much code in the bullet farm

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah this is still clever, prepopulating the indices for a gather.

vidx = hn::Add(vidx, Vone);
y_dst = hn::GatherIndex(d, src, vidx);
vidx = hn::Add(vidx, Vone);
z_dst = hn::GatherIndex(d, src, vidx);
}

template <typename V>
HWY_INLINE void LoadInterleavedIdx(const unsigned int *idx, const double *src,
V &x_dst, V &y_dst, V &z_dst) {
hn::ScalableTag<double> d;
hn::ScalableTag<int64_t> d_long;
// "int64_t" number of lanes but "int" type
hn::Rebind<int, hn::ScalableTag<int64_t>> d_int;

// can't call Gather to doubles with int indices so: load int32s and cast them up to int64s so widths match
// first load NLONG values from idx
// !! important to not segfault here by loading LONG*2 (i.e. a full vector of 32s) !!
auto vidx_narrow = hn::LoadU(d_int, (int*)idx);
// then cast int32 values to int64, taking only lower half of vector
auto vidx = hn::PromoteTo(d_long, vidx_narrow);
auto Vthree = hn::Set(d_long, 3);
auto Vone = hn::Set(d_long, 1);

vidx = hn::Mul(vidx, Vthree);
x_dst = hn::GatherIndex(d, src, vidx);
vidx = hn::Add(vidx, Vone);
y_dst = hn::GatherIndex(d, src, vidx);
vidx = hn::Add(vidx, Vone);
z_dst = hn::GatherIndex(d, src, vidx);
}

template <typename T, typename B>
void CalcBondsIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, int n,
T *out, const B &box) {
const hn::ScalableTag<T> d;
int nlanes = hn::Lanes(d);

unsigned int a_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)];
unsigned int b_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)];
T out_sub[HWY_MAX_LANES_D(hn::ScalableTag<T>)];
T *dst;

if (HWY_UNLIKELY(n < nlanes)) {
// zero out idx arrays, so we can safely load 0th values when we spill
memset(a_sub, 0, sizeof(int) * nlanes);
memset(b_sub, 0, sizeof(int) * nlanes);
memcpy(a_sub, a_idx, sizeof(int) * n);
memcpy(b_sub, b_idx, sizeof(int) * n);
// switch to use a_sub as index array now we've copied contents
a_idx = a_sub;
b_idx = b_sub;
dst = out_sub;
} else {
dst = out;
}

auto a_x = hn::Undefined(d);
auto a_y = hn::Undefined(d);
auto a_z = hn::Undefined(d);
auto b_x = hn::Undefined(d);
auto b_y = hn::Undefined(d);
auto b_z = hn::Undefined(d);

for (size_t i=0; i <= n - nlanes; i += nlanes) {
// load N indices of each source
// interleaved gather these indices
LoadInterleavedIdx(a_idx + i, coords, a_x, a_y, a_z);
LoadInterleavedIdx(b_idx + i, coords, b_x, b_y, b_z);

auto result = box.Distance(a_x, a_y, a_z, b_x, b_y, b_z);

hn::StoreU(result, d, dst + i);
}
size_t rem = n % nlanes;
if (rem) { // if we had a non-multiple of nlanes, do final nlanes values again
LoadInterleavedIdx(a_idx + n - nlanes, coords, a_x, a_y, a_z);
LoadInterleavedIdx(b_idx + n - nlanes, coords, b_x, b_y, b_z);

auto result = box.Distance(a_x, a_y, a_z, b_x, b_y, b_z);

hn::StoreU(result, d, dst + n - nlanes);
}

if (HWY_UNLIKELY(n < nlanes)) {
// copy results back from temp array into actual output array
memcpy(out, out_sub, n * sizeof(T));
}
}

template <typename T, typename B>
void CalcAnglesIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, int n,
T *out, const B &box) {
// Logic here closely follows BondsIdx, except with an additional coordinate
const hn::ScalableTag<T> d;
int nlanes = hn::Lanes(d);

unsigned int a_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)];
unsigned int b_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)];
unsigned int c_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)];
T out_sub[HWY_MAX_LANES_D(hn::ScalableTag<T>)];
T *dst;

if (HWY_UNLIKELY(n < nlanes)) {
memset(a_sub, 0, sizeof(int) * nlanes);
memset(b_sub, 0, sizeof(int) * nlanes);
memset(c_sub, 0, sizeof(int) * nlanes);
memcpy(a_sub, a_idx, sizeof(int) * n);
memcpy(b_sub, b_idx, sizeof(int) * n);
memcpy(c_sub, c_idx, sizeof(int) * n);
a_idx = a_sub;
b_idx = b_sub;
c_idx = c_sub;
dst = out_sub;
} else {
dst = out;
}

auto a_x = hn::Undefined(d);
auto a_y = hn::Undefined(d);
auto a_z = hn::Undefined(d);
auto b_x = hn::Undefined(d);
auto b_y = hn::Undefined(d);
auto b_z = hn::Undefined(d);
auto c_x = hn::Undefined(d);
auto c_y = hn::Undefined(d);
auto c_z = hn::Undefined(d);

for (size_t i=0; i <= n - nlanes; i += nlanes) {
LoadInterleavedIdx(a_idx + i, coords, a_x, a_y, a_z);
LoadInterleavedIdx(b_idx + i, coords, b_x, b_y, b_z);
LoadInterleavedIdx(c_idx + i, coords, c_x, c_y, c_z);

auto result = Angle(a_x, a_y, a_z, b_x, b_y, b_z, c_x, c_y, c_z, box);

hn::StoreU(result, d, dst + i);
}
size_t rem = n % nlanes;
if (rem) {
LoadInterleavedIdx(a_idx + n - nlanes, coords, a_x, a_y, a_z);
LoadInterleavedIdx(b_idx + n - nlanes, coords, b_x, b_y, b_z);
LoadInterleavedIdx(c_idx + n - nlanes, coords, c_x, c_y, c_z);

auto result = Angle(a_x, a_y, a_z, b_x, b_y, b_z, c_x, c_y, c_z, box);

hn::StoreU(result, d, dst + n - nlanes);
}

if (HWY_UNLIKELY(n < nlanes)) {
memcpy(out, out_sub, n * sizeof(T));
}
}

void CalcBondsNoBoxDouble(const double *a, const double *b, int n, double *out) {
hn::ScalableTag<double> d;
const NoBox vbox(d);
Expand Down Expand Up @@ -852,6 +1015,88 @@ namespace distopia {
const TriclinicBox vbox(d, box);
CalcSelfDistanceArray(a, n, out, vbox);
}
void CalcBondsNoBoxIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, float *out) {
hn::ScalableTag<float> d;
const NoBox box(d);
CalcBondsIdx(coords, a_idx, b_idx, n, out, box);
}
void CalcBondsNoBoxIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, double *out) {
hn::ScalableTag<double> d;
const NoBox box(d);
CalcBondsIdx(coords, a_idx, b_idx, n, out, box);
}
void CalcBondsOrthoIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const float *box, float *out) {
hn::ScalableTag<float> d;
const OrthogonalBox vbox(d, box);

CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox);
}
void CalcBondsOrthoIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const double *box, double *out) {
hn::ScalableTag<double> d;
const OrthogonalBox vbox(d, box);

CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox);
}
void CalcBondsTriclinicIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const float *box, float *out) {
hn::ScalableTag<float> d;
const TriclinicBox vbox(d, box);

CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox);
}
void CalcBondsTriclinicIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const double *box, double *out) {
hn::ScalableTag<double> d;
const TriclinicBox vbox(d, box);

CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox);
}
void CalcAnglesNoBoxIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, float *out) {
hn::ScalableTag<float> d;
const NoBox vbox(d);

CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox);
}
void CalcAnglesNoBoxIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, double *out) {
hn::ScalableTag<double> d;
const NoBox vbox(d);

CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox);
}
void CalcAnglesOrthoIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const float *box, float *out) {
hn::ScalableTag<float> d;
const OrthogonalBox vbox(d, box);

CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox);
}
void CalcAnglesOrthoIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const double *box, double *out) {
hn::ScalableTag<double> d;
const OrthogonalBox vbox(d, box);

CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox);
}
void CalcAnglesTriclinicIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const float *box, float *out) {
hn::ScalableTag<float> d;
const TriclinicBox vbox(d, box);

CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox);
}
void CalcAnglesTriclinicIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const double *box, double *out) {
hn::ScalableTag<double> d;
const TriclinicBox vbox(d, box);

CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox);
}

int GetNFloatLanes() {
hn::ScalableTag<float> d;
Expand Down Expand Up @@ -901,6 +1146,18 @@ namespace distopia {
HWY_EXPORT(CalcSelfDistanceArrayOrthoSingle);
HWY_EXPORT(CalcSelfDistanceArrayTriclinicDouble);
HWY_EXPORT(CalcSelfDistanceArrayTriclinicSingle);
HWY_EXPORT(CalcBondsNoBoxIdxSingle);
HWY_EXPORT(CalcBondsNoBoxIdxDouble);
HWY_EXPORT(CalcBondsOrthoIdxSingle);
HWY_EXPORT(CalcBondsOrthoIdxDouble);
HWY_EXPORT(CalcBondsTriclinicIdxSingle);
HWY_EXPORT(CalcBondsTriclinicIdxDouble);
HWY_EXPORT(CalcAnglesNoBoxIdxSingle);
HWY_EXPORT(CalcAnglesNoBoxIdxDouble);
HWY_EXPORT(CalcAnglesOrthoIdxSingle);
HWY_EXPORT(CalcAnglesOrthoIdxDouble);
HWY_EXPORT(CalcAnglesTriclinicIdxSingle);
HWY_EXPORT(CalcAnglesTriclinicIdxDouble);
HWY_EXPORT(GetNFloatLanes);
HWY_EXPORT(GetNDoubleLanes);

Expand Down Expand Up @@ -1038,8 +1295,56 @@ namespace distopia {
}
return HWY_DYNAMIC_DISPATCH(CalcSelfDistanceArrayTriclinicDouble)(a, n, box, out);
}
HWY_DLLEXPORT template <> void CalcBondsNoBoxIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, float *out) {
return HWY_DYNAMIC_DISPATCH(CalcBondsNoBoxIdxSingle)(coords, a_idx, b_idx, n, out);
}
HWY_DLLEXPORT template <> void CalcBondsNoBoxIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, double *out) {
return HWY_DYNAMIC_DISPATCH(CalcBondsNoBoxIdxDouble)(coords, a_idx, b_idx, n, out);
}
HWY_DLLEXPORT template <> void CalcBondsOrthoIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const float *box, float *out) {
return HWY_DYNAMIC_DISPATCH(CalcBondsOrthoIdxSingle)(coords, a_idx, b_idx, n, box, out);
}
HWY_DLLEXPORT template <> void CalcBondsOrthoIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const double *box, double *out) {
return HWY_DYNAMIC_DISPATCH(CalcBondsOrthoIdxDouble)(coords, a_idx, b_idx, n, box, out);
}
HWY_DLLEXPORT template <> void CalcBondsTriclinicIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const float *box, float *out) {
return HWY_DYNAMIC_DISPATCH(CalcBondsTriclinicIdxSingle)(coords, a_idx, b_idx, n, box, out);
}
HWY_DLLEXPORT template <> void CalcBondsTriclinicIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx,
int n, const double *box, double *out) {
return HWY_DYNAMIC_DISPATCH(CalcBondsTriclinicIdxDouble)(coords, a_idx, b_idx, n, box, out);
}
HWY_DLLEXPORT template <> void CalcAnglesNoBoxIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, float *out) {
return HWY_DYNAMIC_DISPATCH(CalcAnglesNoBoxIdxSingle)(coords, a_idx, b_idx, c_idx, n, out);
}
HWY_DLLEXPORT template <> void CalcAnglesNoBoxIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, double *out) {
return HWY_DYNAMIC_DISPATCH(CalcAnglesNoBoxIdxDouble)(coords, a_idx, b_idx, c_idx, n, out);
}
HWY_DLLEXPORT template <> void CalcAnglesOrthoIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const float *box, float *out) {
return HWY_DYNAMIC_DISPATCH(CalcAnglesOrthoIdxSingle)(coords, a_idx, b_idx, c_idx, n, box, out);
}
HWY_DLLEXPORT template <> void CalcAnglesOrthoIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const double *box, double *out) {
return HWY_DYNAMIC_DISPATCH(CalcAnglesOrthoIdxDouble)(coords, a_idx, b_idx, c_idx, n, box, out);
}
HWY_DLLEXPORT template <> void CalcAnglesTriclinicIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const float *box, float *out) {
return HWY_DYNAMIC_DISPATCH(CalcAnglesTriclinicIdxSingle)(coords, a_idx, b_idx, c_idx, n, box, out);
}
HWY_DLLEXPORT template <> void CalcAnglesTriclinicIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx,
int n, const double *box, double *out) {
return HWY_DYNAMIC_DISPATCH(CalcAnglesTriclinicIdxDouble)(coords, a_idx, b_idx, c_idx, n, box, out);
}

std::vector<std::string> DistopiaSupportedAndGeneratedTargets() {
std::vector<std::string> DistopiaSupportedAndGeneratedTargets() {
std::vector<int64_t> targets = hwy::SupportedAndGeneratedTargets();
// for each print the name
std::vector<std::string> names;
Expand Down
Loading
Loading