Skip to content
This repository has been archived by the owner on Sep 15, 2022. It is now read-only.

Commit

Permalink
Major rework on Keccak implementation which together with some other …
Browse files Browse the repository at this point in the history
…minor changes gives up to 4x performance boost on nVidia cards
  • Loading branch information
johguse committed Jul 17, 2019
1 parent ef33b01 commit a268e49
Show file tree
Hide file tree
Showing 10 changed files with 877 additions and 770 deletions.
10 changes: 5 additions & 5 deletions Dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score
const std::string strPublic = toHex(r.foundHash, 20);

// Print
std::cout << " Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << ' ';
const std::string strVT100ClearLine = "\33[2K\r";
std::cout << strVT100ClearLine << " Time: " << std::setw(5) << seconds << "s Score: " << std::setw(2) << (int) score << " Private: 0x" << strPrivate << ' ';

std::cout << mode.transformName();
std::cout << ": 0x" << strPublic << std::endl;
Expand Down Expand Up @@ -116,8 +117,8 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr
m_kernelInverse(createKernel(clProgram, "profanity_inverse_multiple")),
m_kernelInversePost(createKernel(clProgram, "profanity_inverse_post")),
m_kernelEnd(createKernel(clProgram, "profanity_end")),
m_kernelScore(createKernel(clProgram, mode.kernel)),
m_kernelTransform(createKernel(clProgram, mode.transformKernel())),
m_kernelScore(createKernel(clProgram, mode.kernel)),
m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp),
m_memPoints(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
m_memInverse(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true),
Expand Down Expand Up @@ -181,7 +182,7 @@ void Dispatcher::run() {

void Dispatcher::init() {
std::cout << "Initializing devices..." << std::endl;
std::cout << " This can take a minute or two. The number of objects initialized on each" << std::endl;
std::cout << " This should take less than a minute. The number of objects initialized on each" << std::endl;
std::cout << " device is equal to inverse-size * inverse-multiple. To lower" << std::endl;
std::cout << " initialization time (and memory footprint) I suggest lowering the" << std::endl;
std::cout << " inverse-multiple first. You can do this via the -I switch. Do note that" << std::endl;
Expand Down Expand Up @@ -308,8 +309,7 @@ void Dispatcher::enqueueKernel(cl_command_queue & clQueue, cl_kernel & clKernel,
void Dispatcher::enqueueKernelDevice(Device & d, cl_kernel & clKernel, size_t worksizeGlobal, const bool bOneAtATime = false) {
try {
enqueueKernel(d.m_clQueue, clKernel, worksizeGlobal, d.m_worksizeLocal, bOneAtATime);
}
catch ( OpenCLException & e ) {
} catch ( OpenCLException & e ) {
// If local work size is invalid, abandon it and let implementation decide
if ((e.m_res == CL_INVALID_WORK_GROUP_SIZE || e.m_res == CL_INVALID_WORK_ITEM_SIZE) && d.m_worksizeLocal != 0) {
std::cout << std::endl << "warning: local work size abandoned on GPU" << d.m_index << std::endl;
Expand Down
7 changes: 7 additions & 0 deletions Mode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,10 @@ Mode Mode::mirror() {
r.kernel = "profanity_score_mirror";
return r;
}

// Factory for the "doubles" mode: a Mode whose display name is "doubles"
// and whose scoring kernel is "profanity_score_doubles".
Mode Mode::doubles() {
    Mode result;
    result.kernel = "profanity_score_doubles";
    result.name = "doubles";
    return result;
}
1 change: 1 addition & 0 deletions Mode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class Mode {
static Mode zeros();
static Mode letters();
static Mode numbers();
static Mode doubles();

std::string name;

Expand Down
13 changes: 8 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ usage: ./profanity [OPTIONS]
--letters Score on letters anywhere in hash.
--numbers Score on numbers anywhere in hash.
--mirror Score on mirroring from center.
--leading-doubles Score on hashes leading with hexadecimal pairs.
Modes with arguments:
--leading <single hex> Score on hashes leading with given hex character.
Expand All @@ -38,15 +39,15 @@ usage: ./profanity [OPTIONS]
Device control:
-s, --skip <index> Skip device given by index.
-n, --no-cache Don't load cached pre-compiled version of kernel.
-n, --no-cache Don't load cached pre-compiled version of kernel.
Tweaking:
-w, --work <size> Set OpenCL local work size. [default = 64]
-W, --work-max <size> Set OpenCL maximum work size. [default = 1048576]
-W, --work-max <size> Set OpenCL maximum work size. [default = -i * -I]
-i, --inverse-size Set size of modular inverses to calculate in one
work item. [default = 256]
work item. [default = 255]
-I, --inverse-multiple Set how many above work items will run in
parallel. [default = 65536]
parallel. [default = 16384]
Examples:
./profanity --leading f
Expand All @@ -69,10 +70,12 @@ usage: ./profanity [OPTIONS]
|Model|Clock Speed|Memory Speed|Modified straps|Speed|Time to match eight characters|Version
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|RX VEGA56|1408|1100|YES|146 MH/s| ~29 s | 1.1x
|GTX 1070 OC|1950|4450|NO|120.0 MH/s| ~36s | 1.3x
|GTX 1070|1750|4000|NO|106.0 MH/s| ~41s | 1.3x
|R9 290|1150|1400|NO|100 MH/s| ~43 s | 1.1x
|RX 480|1328|2000|YES|97 MH/s| ~45 s| 1.2x
|RX 480|1266|2000|YES|92 MH/s| ~47 s| 1.2x
|RX 580|1366|1750|YES|92 MH/s| ~47 s| 1.2x
|R9 290|1040|1300|NO|91 MH/s| ~47 s | 1.1x
|RX 470|1216|1750|YES|73 MH/s| ~59s | 1.2x
|GTX 1070| - | - | NO | 26.0 MH/s | ~166s | 1.2x

15 changes: 10 additions & 5 deletions SpeedSample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,17 @@ SpeedSample::~SpeedSample() {
}

double SpeedSample::getSpeed() const {
double speed = 0;
for( auto & v : m_lSpeeds) {
speed += v / m_lSpeeds.size();
auto delta = std::chrono::duration_cast<std::chrono::milliseconds>(now() - m_lastTime).count();
if (delta > 5000) {
return 0;
} else {
double speed = 0;
for (auto & v : m_lSpeeds) {
speed += v / m_lSpeeds.size();
}

return speed;
}

return speed;
}

void SpeedSample::sample(const double V) {
Expand Down
16 changes: 0 additions & 16 deletions constants.hpp

This file was deleted.

7 changes: 4 additions & 3 deletions help.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ usage: ./profanity [OPTIONS]
--letters Score on letters anywhere in hash.
--numbers Score on numbers anywhere in hash.
--mirror Score on mirroring from center.
--leading-doubles Score on hashes leading with hexadecimal pairs
Modes with arguments:
--leading <single hex> Score on hashes leading with given hex character.
Expand All @@ -35,11 +36,11 @@ usage: ./profanity [OPTIONS]
Tweaking:
-w, --work <size> Set OpenCL local work size. [default = 64]
-W, --work-max <size> Set OpenCL maximum work size. [default = 1048576]
-W, --work-max <size> Set OpenCL maximum work size. [default = -i * -I]
-i, --inverse-size Set size of modular inverses to calculate in one
work item. [default = 256]
work item. [default = 255]
-I, --inverse-multiple Set how many above work items will run in
parallell. [default = 65536]
parallell. [default = 16384]
Examples:
./profanity --leading f
Expand Down
233 changes: 139 additions & 94 deletions keccak.cl
Original file line number Diff line number Diff line change
@@ -1,94 +1,139 @@
/* Original: https://github.com/mjosaarinen/tiny_sha3
* Below is a very slimmed down version of the already tiny SHA3
* implementation by Markku-Juhani O. Saarinen.
*
* The implementation below is for inputs of exactly 64 bytes
* that's pre-filled in ethhash.b[0] - ethhash.b[63].
*/

// Hash state buffer: one storage area exposed through three overlapping views.
// sha3_keccakf below works on the q view and patches a single word through
// the d view.
typedef union {
// Byte view (200 entries).
uchar b[200];
// 64-bit lane view (25 entries); this is what the permutation rounds operate on.
ulong q[25];
// 32-bit word view (50 entries).
uint d[50];
} ethhash;

// Round constants: 24 entries, one per round. sha3_keccakf XORs entry r into
// st[0] in its Iota step at the end of round r.
__constant ulong keccakf_rndc[24] = {
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};

// Rotation counts for the Rho Pi loop of sha3_keccakf: at step i the lane
// currently being carried is rotated by keccakf_rotc[i] before it is stored
// into st[keccakf_piln[i]].
__constant ulong keccakf_rotc[24] = {
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
};

// Lane indices for the Rho Pi loop of sha3_keccakf: at step i the carried
// lane is written to st[keccakf_piln[i]], and that slot's previous value
// becomes the carried lane for step i + 1. The chain starts from st[1].
__constant int keccakf_piln[24] = {
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
};

/* Runs 24 rounds of the Keccak permutation (Theta, Rho Pi, Chi, Iota) in
 * place over the 25 64-bit lanes of *pHash.
 *
 * Before the rounds, the top bit of 32-bit word d[33] is flipped.
 * NOTE(review): this looks like the closing padding bit for a 136-byte rate
 * block; no other padding is written here, so the caller presumably sets the
 * rest -- confirm against the call site.
 *
 * Barely a bottleneck. No need to tinker more. */
void sha3_keccakf(ethhash * const pHash)
{
    // q is already an array of ulong and decays to a pointer to its first
    // lane. Taking &pHash->q instead would yield a pointer-to-array
    // (ulong (*)[25]), which is not compatible with ulong *.
    ulong * const st = pHash->q;
    pHash->d[33] ^= 0x80000000;

    // variables
    int i, j, r;
    ulong t, bc[5];

    // actual iteration
    for (r = 0; r < 24; r++) {
        // Theta - unrolled: bc[x] is the XOR of the five lanes in column x.
        bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
        bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
        bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
        bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
        bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];

        // Mix each column with its left neighbour's parity and the
        // right neighbour's parity rotated by one bit.
        for (i = 0; i < 5; i++) {
            t = bc[(i + 4) % 5] ^ rotate(bc[(i + 1) % 5], (ulong) 1);

            st[i] ^= t;
            st[i + 5] ^= t;
            st[i + 10] ^= t;
            st[i + 15] ^= t;
            st[i + 20] ^= t;
        }

        // Rho Pi: walk the lane chain given by keccakf_piln, carrying the
        // displaced lane in t and rotating it by keccakf_rotc[i] on store.
        t = st[1];
        for (i = 0; i < 24; i++) {
            j = keccakf_piln[i];
            bc[0] = st[j];

            st[j] = rotate(t, keccakf_rotc[i]);
            t = bc[0];
        }

        // Chi: per row of five lanes; the row is copied to bc first so every
        // update reads pre-update neighbours.
        for (j = 0; j < 25; j += 5) {
            bc[0] = st[j + 0];
            bc[1] = st[j + 1];
            bc[2] = st[j + 2];
            bc[3] = st[j + 3];
            bc[4] = st[j + 4];

            st[j + 0] ^= (~bc[1]) & bc[2];
            st[j + 1] ^= (~bc[2]) & bc[3];
            st[j + 2] ^= (~bc[3]) & bc[4];
            st[j + 3] ^= (~bc[4]) & bc[0];
            st[j + 4] ^= (~bc[0]) & bc[1];
        }

        // Iota
        st[0] ^= keccakf_rndc[r];
    }
}
/* This Keccak implementation is an amalgamation of:
* Tiny SHA3 implementation by Markku-Juhani O. Saarinen:
* https://github.com/mjosaarinen/tiny_sha3
* Keccak implementation found in xptMiner-gpu @ Github:
* https://github.com/llamasoft/xptMiner-gpu/blob/master/opencl/keccak.cl
*/

// Hash state buffer: one storage area exposed through three overlapping views.
// sha3_keccakf below works on the q view and patches a single word through
// the d view.
typedef union {
// Byte view (200 entries).
uchar b[200];
// 64-bit lane view (25 entries); this is what the permutation rounds operate on.
ulong q[25];
// 32-bit word view (50 entries).
uint d[50];
} ethhash;

// Theta helper: stores into t the XOR of c0..c4 combined (XOR) with the XOR
// of d0..d4 rotated by one bit via rotate().
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) \
{ \
    const ulong th_rot = rotate((ulong)(d0 ^ d1 ^ d2 ^ d3 ^ d4), (ulong)1); \
    t = th_rot ^ (c0 ^ c1 ^ c2 ^ c3 ^ c4); \
}

// Theta step over 25 lanes passed as lvalues. In parameter name sXY, lanes
// sharing the same X form one group of five (sX0..sX4).
// For each group X, TH_ELT builds tX from the XOR of group X-1 and the
// rotated XOR of group X+1 (indices wrap modulo 5); tX is then XORed into
// all five lanes of group X.
// All of t0..t4 are computed before any lane is modified, so every tX is
// derived from the pre-update state.
// Requires ulong variables t0, t1, t2, t3, t4 to be declared in the
// enclosing scope; they are overwritten.
#define THETA(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
TH_ELT(t0, s40, s41, s42, s43, s44, s10, s11, s12, s13, s14); \
TH_ELT(t1, s00, s01, s02, s03, s04, s20, s21, s22, s23, s24); \
TH_ELT(t2, s10, s11, s12, s13, s14, s30, s31, s32, s33, s34); \
TH_ELT(t3, s20, s21, s22, s23, s24, s40, s41, s42, s43, s44); \
TH_ELT(t4, s30, s31, s32, s33, s34, s00, s01, s02, s03, s04); \
s00 ^= t0; s01 ^= t0; s02 ^= t0; s03 ^= t0; s04 ^= t0; \
s10 ^= t1; s11 ^= t1; s12 ^= t1; s13 ^= t1; s14 ^= t1; \
s20 ^= t2; s21 ^= t2; s22 ^= t2; s23 ^= t2; s24 ^= t2; \
s30 ^= t3; s31 ^= t3; s32 ^= t3; s33 ^= t3; s34 ^= t3; \
s40 ^= t4; s41 ^= t4; s42 ^= t4; s43 ^= t4; s44 ^= t4; \
}

// Combined Rho and Pi steps: every lane except s00 is replaced by a rotated
// copy of another lane, as one chain of 24 moves.
// The chain is order-dependent: t0 captures rotate(s10, 1) first, each
// following line overwrites the lane whose old value the previous line just
// consumed, and the saved t0 is stored into s02 last. Do not reorder.
// s00 is neither read nor written.
// Requires a ulong variable t0 in the enclosing scope; it is overwritten.
#define RHOPI(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
t0 = rotate(s10, (ulong) 1); \
s10 = rotate(s11, (ulong)44); \
s11 = rotate(s41, (ulong)20); \
s41 = rotate(s24, (ulong)61); \
s24 = rotate(s42, (ulong)39); \
s42 = rotate(s04, (ulong)18); \
s04 = rotate(s20, (ulong)62); \
s20 = rotate(s22, (ulong)43); \
s22 = rotate(s32, (ulong)25); \
s32 = rotate(s43, (ulong) 8); \
s43 = rotate(s34, (ulong)56); \
s34 = rotate(s03, (ulong)41); \
s03 = rotate(s40, (ulong)27); \
s40 = rotate(s44, (ulong)14); \
s44 = rotate(s14, (ulong) 2); \
s14 = rotate(s31, (ulong)55); \
s31 = rotate(s13, (ulong)45); \
s13 = rotate(s01, (ulong)36); \
s01 = rotate(s30, (ulong)28); \
s30 = rotate(s33, (ulong)21); \
s33 = rotate(s23, (ulong)15); \
s23 = rotate(s12, (ulong)10); \
s12 = rotate(s21, (ulong) 6); \
s21 = rotate(s02, (ulong) 3); \
s02 = t0; \
}

// Chi step. Lanes are processed in five sets of five; a set consists of the
// lanes sharing the same second digit Y (s0Y, s1Y, s2Y, s3Y, s4Y).
// Within a set each lane becomes  lane ^ (~next & next_next), where "next"
// is the lane with first digit one higher (wrapping modulo 5).
// The five new values are first computed into t0..t4 and only then written
// back, so every result is based on the set's pre-update values.
// Requires ulong variables t0, t1, t2, t3, t4 to be declared in the
// enclosing scope; they are overwritten.
#define KHI(s00, s01, s02, s03, s04, \
s10, s11, s12, s13, s14, \
s20, s21, s22, s23, s24, \
s30, s31, s32, s33, s34, \
s40, s41, s42, s43, s44) \
{ \
t0 = s00 ^ (~s10 & s20); \
t1 = s10 ^ (~s20 & s30); \
t2 = s20 ^ (~s30 & s40); \
t3 = s30 ^ (~s40 & s00); \
t4 = s40 ^ (~s00 & s10); \
s00 = t0; s10 = t1; s20 = t2; s30 = t3; s40 = t4; \
\
t0 = s01 ^ (~s11 & s21); \
t1 = s11 ^ (~s21 & s31); \
t2 = s21 ^ (~s31 & s41); \
t3 = s31 ^ (~s41 & s01); \
t4 = s41 ^ (~s01 & s11); \
s01 = t0; s11 = t1; s21 = t2; s31 = t3; s41 = t4; \
\
t0 = s02 ^ (~s12 & s22); \
t1 = s12 ^ (~s22 & s32); \
t2 = s22 ^ (~s32 & s42); \
t3 = s32 ^ (~s42 & s02); \
t4 = s42 ^ (~s02 & s12); \
s02 = t0; s12 = t1; s22 = t2; s32 = t3; s42 = t4; \
\
t0 = s03 ^ (~s13 & s23); \
t1 = s13 ^ (~s23 & s33); \
t2 = s23 ^ (~s33 & s43); \
t3 = s33 ^ (~s43 & s03); \
t4 = s43 ^ (~s03 & s13); \
s03 = t0; s13 = t1; s23 = t2; s33 = t3; s43 = t4; \
\
t0 = s04 ^ (~s14 & s24); \
t1 = s14 ^ (~s24 & s34); \
t2 = s24 ^ (~s34 & s44); \
t3 = s34 ^ (~s44 & s04); \
t4 = s44 ^ (~s04 & s14); \
s04 = t0; s14 = t1; s24 = t2; s34 = t3; s44 = t4; \
}

// Iota step: XORs the value r into lane s00 (sha3_keccakf passes st[0] and
// the current round's entry of keccakf_rndc).
#define IOTA(s00, r) { s00 ^= r; }

// Round constants: 24 entries, one per round. sha3_keccakf passes entry i to
// IOTA at the end of round i, which XORs it into st[0].
__constant ulong keccakf_rndc[24] = {
0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
0x8000000000008080, 0x0000000080000001, 0x8000000080008008
};

// Runs 24 rounds of the Keccak permutation (THETA, RHOPI, KHI, IOTA) in
// place over the 25 64-bit lanes of *h.
//
// Before the rounds, the top bit of 32-bit word d[33] is flipped.
// NOTE(review): this looks like the closing padding bit for a 136-byte rate
// block; no other padding is written here, so the caller presumably sets the
// rest -- confirm against the call site.
//
// Barely a bottleneck. No need to tinker more.
void sha3_keccakf(ethhash * const h)
{
    // q is already an array of ulong and decays to a pointer to its first
    // lane. Taking &h->q instead would yield a pointer-to-array
    // (ulong (*)[25]), which is not compatible with ulong *.
    ulong * const st = h->q;
    h->d[33] ^= 0x80000000;

    // Scratch lanes used by the THETA / RHOPI / KHI macros.
    ulong t0, t1, t2, t3, t4;

    // Unrolling and removing PI stage gave negligible performance gain on GTX 1070.
    for (int i = 0; i < 24; ++i) {
        // Each macro receives the lanes in the same order: five consecutive
        // arguments step through st[] with stride 5 (st[x], st[x + 5], ...).
        THETA(st[0], st[5], st[10], st[15], st[20],
              st[1], st[6], st[11], st[16], st[21],
              st[2], st[7], st[12], st[17], st[22],
              st[3], st[8], st[13], st[18], st[23],
              st[4], st[9], st[14], st[19], st[24]);
        RHOPI(st[0], st[5], st[10], st[15], st[20],
              st[1], st[6], st[11], st[16], st[21],
              st[2], st[7], st[12], st[17], st[22],
              st[3], st[8], st[13], st[18], st[23],
              st[4], st[9], st[14], st[19], st[24]);
        KHI(st[0], st[5], st[10], st[15], st[20],
            st[1], st[6], st[11], st[16], st[21],
            st[2], st[7], st[12], st[17], st[22],
            st[3], st[8], st[13], st[18], st[23],
            st[4], st[9], st[14], st[19], st[24]);
        IOTA(st[0], keccakf_rndc[i]);
    }
}
Loading

0 comments on commit a268e49

Please sign in to comment.