diff --git a/OpenCL/inc_hash_scrypt.cl b/OpenCL/inc_hash_scrypt.cl index 05a3464de..f18f75250 100644 --- a/OpenCL/inc_hash_scrypt.cl +++ b/OpenCL/inc_hash_scrypt.cl @@ -9,441 +9,288 @@ #include "inc_common.h" #include "inc_hash_scrypt.h" -DECLSPEC void salsa_r_l (LOCAL_AS u32 *TI) +DECLSPEC void salsa_r (PRIVATE_AS u32 *TI) { - u32 x[16]; + u32 TT[STATE_CNT4/2]; // we actually need 16 here for SALSA, but we reuse that buffer for blockmix - for (int j = 0; j < 16; j++) x[j] = TI[STATE_CNT - 16 + j]; + for (int j = 0; j < SALSA_CNT4; j++) TT[j] = TI[STATE_CNT4 - 16 + j]; - for (int i = 0; i < STATE_CNT; i += 16) + for (int i = 0; i < STATE_CNT4; i += SALSA_CNT4) { - for (int j = 0; j < 16; j++) - { - x[j] ^= TI[i + j]; - } + for (int j = 0; j < SALSA_CNT4; j++) TT[j] ^= TI[i + j]; - for (int j = 0; j < 16; j++) - { - TI[i + j] = x[j]; - } + for (int j = 0; j < SALSA_CNT4; j++) TI[i + j] = TT[j]; for (int r = 0; r < 4; r++) { u32 t0, t1, t2, t3; - t0 = x[ 0] + x[12]; - t1 = x[ 1] + x[13]; - t2 = x[ 2] + x[14]; - t3 = x[ 3] + x[15]; - x[ 4] ^= hc_rotl32_S (t0, 7); - x[ 5] ^= hc_rotl32_S (t1, 7); - x[ 6] ^= hc_rotl32_S (t2, 7); - x[ 7] ^= hc_rotl32_S (t3, 7); + t0 = TT[ 0] + TT[12]; + t1 = TT[ 1] + TT[13]; + t2 = TT[ 2] + TT[14]; + t3 = TT[ 3] + TT[15]; + TT[ 4] ^= hc_rotl32_S (t0, 7); + TT[ 5] ^= hc_rotl32_S (t1, 7); + TT[ 6] ^= hc_rotl32_S (t2, 7); + TT[ 7] ^= hc_rotl32_S (t3, 7); - t0 = x[ 4] + x[ 0]; - t1 = x[ 5] + x[ 1]; - t2 = x[ 6] + x[ 2]; - t3 = x[ 7] + x[ 3]; - x[ 8] ^= hc_rotl32_S (t0, 9); - x[ 9] ^= hc_rotl32_S (t1, 9); - x[10] ^= hc_rotl32_S (t2, 9); - x[11] ^= hc_rotl32_S (t3, 9); + t0 = TT[ 4] + TT[ 0]; + t1 = TT[ 5] + TT[ 1]; + t2 = TT[ 6] + TT[ 2]; + t3 = TT[ 7] + TT[ 3]; + TT[ 8] ^= hc_rotl32_S (t0, 9); + TT[ 9] ^= hc_rotl32_S (t1, 9); + TT[10] ^= hc_rotl32_S (t2, 9); + TT[11] ^= hc_rotl32_S (t3, 9); - t0 = x[ 8] + x[ 4]; - t1 = x[ 9] + x[ 5]; - t2 = x[10] + x[ 6]; - t3 = x[11] + x[ 7]; - x[12] ^= hc_rotl32_S (t0, 13); - x[13] ^= hc_rotl32_S (t1, 13); - x[14] ^= hc_rotl32_S (t2, 13); - x[15] ^= hc_rotl32_S (t3, 13); + t0 = TT[ 8] + TT[ 4]; + t1 = TT[ 9] + TT[ 5]; + t2 = TT[10] + TT[ 6]; + t3 = TT[11] + TT[ 7]; + TT[12] ^= hc_rotl32_S (t0, 13); + TT[13] ^= hc_rotl32_S (t1, 13); + TT[14] ^= hc_rotl32_S (t2, 13); + TT[15] ^= hc_rotl32_S (t3, 13); - t0 = x[12] + x[ 8]; - t1 = x[13] + x[ 9]; - t2 = x[14] + x[10]; - t3 = x[15] + x[11]; - x[ 0] ^= hc_rotl32_S (t0, 18); - x[ 1] ^= hc_rotl32_S (t1, 18); - x[ 2] ^= hc_rotl32_S (t2, 18); - x[ 3] ^= hc_rotl32_S (t3, 18); + t0 = TT[12] + TT[ 8]; + t1 = TT[13] + TT[ 9]; + t2 = TT[14] + TT[10]; + t3 = TT[15] + TT[11]; + TT[ 0] ^= hc_rotl32_S (t0, 18); + TT[ 1] ^= hc_rotl32_S (t1, 18); + TT[ 2] ^= hc_rotl32_S (t2, 18); + TT[ 3] ^= hc_rotl32_S (t3, 18); - t0 = x[ 4]; x[ 4] = x[ 7]; x[ 7] = x[ 6]; x[ 6] = x[ 5]; x[ 5] = t0; - t0 = x[ 8]; x[ 8] = x[10]; x[10] = t0; - t0 = x[ 9]; x[ 9] = x[11]; x[11] = t0; - t0 = x[12]; x[12] = x[13]; x[13] = x[14]; x[14] = x[15]; x[15] = t0; + t0 = TT[ 4]; TT[ 4] = TT[ 7]; TT[ 7] = TT[ 6]; TT[ 6] = TT[ 5]; TT[ 5] = t0; + t0 = TT[ 8]; TT[ 8] = TT[10]; TT[10] = t0; + t0 = TT[ 9]; TT[ 9] = TT[11]; TT[11] = t0; + t0 = TT[12]; TT[12] = TT[13]; TT[13] = TT[14]; TT[14] = TT[15]; TT[15] = t0; - t0 = x[ 0] + x[ 4]; - t1 = x[ 1] + x[ 5]; - t2 = x[ 2] + x[ 6]; - t3 = x[ 3] + x[ 7]; - x[12] ^= hc_rotl32_S (t0, 7); - x[13] ^= hc_rotl32_S (t1, 7); - x[14] ^= hc_rotl32_S (t2, 7); - x[15] ^= hc_rotl32_S (t3, 7); + t0 = TT[ 0] + TT[ 4]; + t1 = TT[ 1] + TT[ 5]; + t2 = TT[ 2] + TT[ 6]; + t3 = TT[ 3] + TT[ 7]; + TT[12] ^= 
hc_rotl32_S (t0, 7); + TT[13] ^= hc_rotl32_S (t1, 7); + TT[14] ^= hc_rotl32_S (t2, 7); + TT[15] ^= hc_rotl32_S (t3, 7); - t0 = x[12] + x[ 0]; - t1 = x[13] + x[ 1]; - t2 = x[14] + x[ 2]; - t3 = x[15] + x[ 3]; - x[ 8] ^= hc_rotl32_S (t0, 9); - x[ 9] ^= hc_rotl32_S (t1, 9); - x[10] ^= hc_rotl32_S (t2, 9); - x[11] ^= hc_rotl32_S (t3, 9); + t0 = TT[12] + TT[ 0]; + t1 = TT[13] + TT[ 1]; + t2 = TT[14] + TT[ 2]; + t3 = TT[15] + TT[ 3]; + TT[ 8] ^= hc_rotl32_S (t0, 9); + TT[ 9] ^= hc_rotl32_S (t1, 9); + TT[10] ^= hc_rotl32_S (t2, 9); + TT[11] ^= hc_rotl32_S (t3, 9); - t0 = x[ 8] + x[12]; - t1 = x[ 9] + x[13]; - t2 = x[10] + x[14]; - t3 = x[11] + x[15]; - x[ 4] ^= hc_rotl32_S (t0, 13); - x[ 5] ^= hc_rotl32_S (t1, 13); - x[ 6] ^= hc_rotl32_S (t2, 13); - x[ 7] ^= hc_rotl32_S (t3, 13); + t0 = TT[ 8] + TT[12]; + t1 = TT[ 9] + TT[13]; + t2 = TT[10] + TT[14]; + t3 = TT[11] + TT[15]; + TT[ 4] ^= hc_rotl32_S (t0, 13); + TT[ 5] ^= hc_rotl32_S (t1, 13); + TT[ 6] ^= hc_rotl32_S (t2, 13); + TT[ 7] ^= hc_rotl32_S (t3, 13); - t0 = x[ 4] + x[ 8]; - t1 = x[ 5] + x[ 9]; - t2 = x[ 6] + x[10]; - t3 = x[ 7] + x[11]; - x[ 0] ^= hc_rotl32_S (t0, 18); - x[ 1] ^= hc_rotl32_S (t1, 18); - x[ 2] ^= hc_rotl32_S (t2, 18); - x[ 3] ^= hc_rotl32_S (t3, 18); + t0 = TT[ 4] + TT[ 8]; + t1 = TT[ 5] + TT[ 9]; + t2 = TT[ 6] + TT[10]; + t3 = TT[ 7] + TT[11]; + TT[ 0] ^= hc_rotl32_S (t0, 18); + TT[ 1] ^= hc_rotl32_S (t1, 18); + TT[ 2] ^= hc_rotl32_S (t2, 18); + TT[ 3] ^= hc_rotl32_S (t3, 18); - t0 = x[ 4]; x[ 4] = x[ 5]; x[ 5] = x[ 6]; x[ 6] = x[ 7]; x[ 7] = t0; - t0 = x[ 8]; x[ 8] = x[10]; x[10] = t0; - t0 = x[ 9]; x[ 9] = x[11]; x[11] = t0; - t0 = x[15]; x[15] = x[14]; x[14] = x[13]; x[13] = x[12]; x[12] = t0; + t0 = TT[ 4]; TT[ 4] = TT[ 5]; TT[ 5] = TT[ 6]; TT[ 6] = TT[ 7]; TT[ 7] = t0; + t0 = TT[ 8]; TT[ 8] = TT[10]; TT[10] = t0; + t0 = TT[ 9]; TT[ 9] = TT[11]; TT[11] = t0; + t0 = TT[15]; TT[15] = TT[14]; TT[14] = TT[13]; TT[13] = TT[12]; TT[12] = t0; } - for (int j = 0; j < 16; j++) - { - x[j] += TI[i + j]; - } + for (int j = 0; j < SALSA_CNT4; j++) TT[j] += TI[i + j]; - for (int j = 0; j < 16; j++) - { - TI[i + j] = x[j]; - } + for (int j = 0; j < SALSA_CNT4; j++) TI[i + j] = TT[j]; } #if SCRYPT_R > 1 - u32 TT[STATE_CNT / 2]; - - for (int dst_off = 0, src_off = 16; src_off < STATE_CNT; dst_off += 16, src_off += 32) + for (int dst_off = 0, src_off = SALSA_CNT4; src_off < STATE_CNT4; dst_off += SALSA_CNT4, src_off += SALSA_CNT4 * 2) { - for (int j = 0; j < 16; j++) TT[dst_off + j] = TI[src_off + j]; + for (int j = 0; j < SALSA_CNT4; j++) TT[dst_off + j] = TI[src_off + j]; } - for (int dst_off = 16, src_off = 32; src_off < STATE_CNT; dst_off += 16, src_off += 32) + for (int dst_off = SALSA_CNT4, src_off = SALSA_CNT4 * 2; src_off < STATE_CNT4; dst_off += SALSA_CNT4, src_off += SALSA_CNT4 * 2) { - for (int j = 0; j < 16; j++) TI[dst_off + j] = TI[src_off + j]; + for (int j = 0; j < SALSA_CNT4; j++) TI[dst_off + j] = TI[src_off + j]; } - for (int dst_off = STATE_CNT / 2, src_off = 0; dst_off < STATE_CNT; dst_off += 16, src_off += 16) + for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += SALSA_CNT4, src_off += SALSA_CNT4) { - for (int j = 0; j < 16; j++) TI[dst_off + j] = TT[src_off + j]; + for (int j = 0; j < SALSA_CNT4; j++) TI[dst_off + j] = TT[src_off + j]; } #endif } -DECLSPEC void salsa_r_p (PRIVATE_AS u32 *TI) -{ - u32 x[16]; - - for (int j = 0; j < 16; j++) x[j] = TI[STATE_CNT - 16 + j]; - - for (int i = 0; i < STATE_CNT; i += 16) - { - for (int j = 0; j < 16; j++) - { - x[j] ^= TI[i + j]; - } - - for 
(int j = 0; j < 16; j++) - { - TI[i + j] = x[j]; - } - - for (int r = 0; r < 4; r++) - { - u32 t0, t1, t2, t3; - - t0 = x[ 0] + x[12]; - t1 = x[ 1] + x[13]; - t2 = x[ 2] + x[14]; - t3 = x[ 3] + x[15]; - x[ 4] ^= hc_rotl32_S (t0, 7); - x[ 5] ^= hc_rotl32_S (t1, 7); - x[ 6] ^= hc_rotl32_S (t2, 7); - x[ 7] ^= hc_rotl32_S (t3, 7); - - t0 = x[ 4] + x[ 0]; - t1 = x[ 5] + x[ 1]; - t2 = x[ 6] + x[ 2]; - t3 = x[ 7] + x[ 3]; - x[ 8] ^= hc_rotl32_S (t0, 9); - x[ 9] ^= hc_rotl32_S (t1, 9); - x[10] ^= hc_rotl32_S (t2, 9); - x[11] ^= hc_rotl32_S (t3, 9); - - t0 = x[ 8] + x[ 4]; - t1 = x[ 9] + x[ 5]; - t2 = x[10] + x[ 6]; - t3 = x[11] + x[ 7]; - x[12] ^= hc_rotl32_S (t0, 13); - x[13] ^= hc_rotl32_S (t1, 13); - x[14] ^= hc_rotl32_S (t2, 13); - x[15] ^= hc_rotl32_S (t3, 13); - - t0 = x[12] + x[ 8]; - t1 = x[13] + x[ 9]; - t2 = x[14] + x[10]; - t3 = x[15] + x[11]; - x[ 0] ^= hc_rotl32_S (t0, 18); - x[ 1] ^= hc_rotl32_S (t1, 18); - x[ 2] ^= hc_rotl32_S (t2, 18); - x[ 3] ^= hc_rotl32_S (t3, 18); - - t0 = x[ 4]; x[ 4] = x[ 7]; x[ 7] = x[ 6]; x[ 6] = x[ 5]; x[ 5] = t0; - t0 = x[ 8]; x[ 8] = x[10]; x[10] = t0; - t0 = x[ 9]; x[ 9] = x[11]; x[11] = t0; - t0 = x[12]; x[12] = x[13]; x[13] = x[14]; x[14] = x[15]; x[15] = t0; - - t0 = x[ 0] + x[ 4]; - t1 = x[ 1] + x[ 5]; - t2 = x[ 2] + x[ 6]; - t3 = x[ 3] + x[ 7]; - x[12] ^= hc_rotl32_S (t0, 7); - x[13] ^= hc_rotl32_S (t1, 7); - x[14] ^= hc_rotl32_S (t2, 7); - x[15] ^= hc_rotl32_S (t3, 7); - - t0 = x[12] + x[ 0]; - t1 = x[13] + x[ 1]; - t2 = x[14] + x[ 2]; - t3 = x[15] + x[ 3]; - x[ 8] ^= hc_rotl32_S (t0, 9); - x[ 9] ^= hc_rotl32_S (t1, 9); - x[10] ^= hc_rotl32_S (t2, 9); - x[11] ^= hc_rotl32_S (t3, 9); - - t0 = x[ 8] + x[12]; - t1 = x[ 9] + x[13]; - t2 = x[10] + x[14]; - t3 = x[11] + x[15]; - x[ 4] ^= hc_rotl32_S (t0, 13); - x[ 5] ^= hc_rotl32_S (t1, 13); - x[ 6] ^= hc_rotl32_S (t2, 13); - x[ 7] ^= hc_rotl32_S (t3, 13); - - t0 = x[ 4] + x[ 8]; - t1 = x[ 5] + x[ 9]; - t2 = x[ 6] + x[10]; - t3 = x[ 7] + x[11]; - x[ 0] ^= hc_rotl32_S (t0, 18); - x[ 1] ^= hc_rotl32_S (t1, 18); - x[ 2] ^= hc_rotl32_S (t2, 18); - x[ 3] ^= hc_rotl32_S (t3, 18); - - t0 = x[ 4]; x[ 4] = x[ 5]; x[ 5] = x[ 6]; x[ 6] = x[ 7]; x[ 7] = t0; - t0 = x[ 8]; x[ 8] = x[10]; x[10] = t0; - t0 = x[ 9]; x[ 9] = x[11]; x[11] = t0; - t0 = x[15]; x[15] = x[14]; x[14] = x[13]; x[13] = x[12]; x[12] = t0; - } - - for (int j = 0; j < 16; j++) - { - x[j] += TI[i + j]; - } - - for (int j = 0; j < 16; j++) - { - TI[i + j] = x[j]; - } - } - - #if SCRYPT_R > 1 - - u32 TT[STATE_CNT / 2]; - - for (int dst_off = 0, src_off = 16; src_off < STATE_CNT; dst_off += 16, src_off += 32) - { - for (int j = 0; j < 16; j++) TT[dst_off + j] = TI[src_off + j]; - } - - for (int dst_off = 16, src_off = 32; src_off < STATE_CNT; dst_off += 16, src_off += 32) - { - for (int j = 0; j < 16; j++) TI[dst_off + j] = TI[src_off + j]; - } - - for (int dst_off = STATE_CNT / 2, src_off = 0; dst_off < STATE_CNT; dst_off += 16, src_off += 16) - { - for (int j = 0; j < 16; j++) TI[dst_off + j] = TT[src_off + j]; - } - - #endif -} - -#ifdef IS_HIP -DECLSPEC void scrypt_smix_init (LOCAL_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid, const u64 lid, const u64 lsz, const u64 bid) -#else -DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid, const u64 lid, const u64 lsz, const u64 bid) -#endif +DECLSPEC void scrypt_smix_init (PRIVATE_AS u32 *X, GLOBAL_AS void *V0, GLOBAL_AS void *V1, GLOBAL_AS 
void *V2, GLOBAL_AS void *V3, const u32 gid, const u32 lid, const u32 lsz, const u32 bid) { const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO; - const u32 zSIZE = STATE_CNT4; + const u32 zSIZE = STATE_CNT44; const u32 xd4 = bid / 4; const u32 xm4 = bid & 3; + PRIVATE_AS uint4 *X4 = (PRIVATE_AS uint4 *) X; + GLOBAL_AS uint4 *V; switch (xm4) { - case 0: V = V0; break; - case 1: V = V1; break; - case 2: V = V2; break; - case 3: V = V3; break; + case 0: V = (GLOBAL_AS uint4 *) V0; break; + case 1: V = (GLOBAL_AS uint4 *) V1; break; + case 2: V = (GLOBAL_AS uint4 *) V2; break; + case 3: V = (GLOBAL_AS uint4 *) V3; break; } for (u32 y = 0; y < ySIZE; y++) { - for (u32 z = 0; z < zSIZE; z++) V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)] = X[z]; + for (u32 z = 0; z < zSIZE; z++) V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)] = X4[z]; - #ifdef IS_HIP - for (u32 i = 0; i < (1 << SCRYPT_TMTO); i++) salsa_r_l ((LOCAL_AS u32 *) X); - #else - for (u32 i = 0; i < (1 << SCRYPT_TMTO); i++) salsa_r_p ((PRIVATE_AS u32 *) X); - #endif + for (u32 i = 0; i < (1 << SCRYPT_TMTO); i++) salsa_r (X); } } -#ifdef IS_HIP -DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, LOCAL_AS uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid, const u64 lid, const u64 lsz, const u64 bid) -#else -DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid, const u64 lid, const u64 lsz, const u64 bid) -#endif +DECLSPEC void scrypt_smix_loop (PRIVATE_AS u32 *X, PRIVATE_AS u32 *T, GLOBAL_AS void *V0, GLOBAL_AS void *V1, GLOBAL_AS void *V2, GLOBAL_AS void *V3, const u32 gid, const u32 lid, const u32 lsz, const u32 bid) { const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO; - const u32 zSIZE = STATE_CNT4; + const u32 zSIZE = STATE_CNT44; const u32 xd4 = bid / 4; const u32 xm4 = bid & 3; + PRIVATE_AS uint4 *X4 = (PRIVATE_AS uint4 *) X; + PRIVATE_AS uint4 *T4 = (PRIVATE_AS uint4 *) T; + GLOBAL_AS uint4 *V; switch (xm4) { - case 0: V = V0; break; - case 1: V = V1; break; - case 2: V = V2; break; - case 3: V = V3; break; + case 0: V = (GLOBAL_AS uint4 *) V0; break; + case 1: V = (GLOBAL_AS uint4 *) V1; break; + case 2: V = (GLOBAL_AS uint4 *) V2; break; + case 3: V = (GLOBAL_AS uint4 *) V3; break; } // note: max 2048 iterations = forced -u 2048 const u32 N_max = (SCRYPT_N < 2048) ? 
SCRYPT_N : 2048; for (u32 N_pos = 0; N_pos < N_max; N_pos++) { - const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1); + const u32 k = X4[zSIZE - 4].x & (SCRYPT_N - 1); const u32 y = k >> SCRYPT_TMTO; const u32 km = k - (y << SCRYPT_TMTO); - for (u32 z = 0; z < zSIZE; z++) T[z] = V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)]; + for (u32 z = 0; z < zSIZE; z++) T4[z] = V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)]; - #ifdef IS_HIP - for (u32 i = 0; i < km; i++) salsa_r_l ((LOCAL_AS u32 *) T); - #else - for (u32 i = 0; i < km; i++) salsa_r_p ((PRIVATE_AS u32 *) T); - #endif + for (u32 i = 0; i < km; i++) salsa_r (T); - for (u32 z = 0; z < zSIZE; z++) X[z] ^= T[z]; + for (u32 z = 0; z < zSIZE; z++) X4[z] = X4[z] ^ T4[z]; - salsa_r_p ((PRIVATE_AS u32 *) X); + salsa_r (X); } } -DECLSPEC void scrypt_blockmix_in (GLOBAL_AS uint4 *out_buf, const int out_len) +DECLSPEC void scrypt_blockmix_in (PRIVATE_AS u32 *out_buf, const int out_len) { - for (int i = 0, j = 0; i < out_len; i += 64, j += 4) + for (int i = 0, j = 0; i < out_len; i += SALSA_SZ, j += SALSA_CNT4) { - uint4 T[4]; + u32 X[SALSA_CNT4]; - T[0] = out_buf[j + 0]; - T[1] = out_buf[j + 1]; - T[2] = out_buf[j + 2]; - T[3] = out_buf[j + 3]; + X[ 0] = out_buf[j + 0]; + X[ 1] = out_buf[j + 5]; + X[ 2] = out_buf[j + 10]; + X[ 3] = out_buf[j + 15]; + X[ 4] = out_buf[j + 4]; + X[ 5] = out_buf[j + 9]; + X[ 6] = out_buf[j + 14]; + X[ 7] = out_buf[j + 3]; + X[ 8] = out_buf[j + 8]; + X[ 9] = out_buf[j + 13]; + X[10] = out_buf[j + 2]; + X[11] = out_buf[j + 7]; + X[12] = out_buf[j + 12]; + X[13] = out_buf[j + 1]; + X[14] = out_buf[j + 6]; + X[15] = out_buf[j + 11]; - uint4 X[4]; - - #if defined IS_CUDA || defined IS_HIP - X[0] = make_uint4 (T[0].x, T[1].y, T[2].z, T[3].w); - X[1] = make_uint4 (T[1].x, T[2].y, T[3].z, T[0].w); - X[2] = make_uint4 (T[2].x, T[3].y, T[0].z, T[1].w); - X[3] = make_uint4 (T[3].x, T[0].y, T[1].z, T[2].w); - #elif defined IS_METAL - X[0] = uint4 (T[0].x, T[1].y, T[2].z, T[3].w); - X[1] = uint4 (T[1].x, T[2].y, T[3].z, T[0].w); - X[2] = uint4 (T[2].x, T[3].y, T[0].z, T[1].w); - X[3] = uint4 (T[3].x, T[0].y, T[1].z, T[2].w); - #else - X[0] = (uint4) (T[0].x, T[1].y, T[2].z, T[3].w); - X[1] = (uint4) (T[1].x, T[2].y, T[3].z, T[0].w); - X[2] = (uint4) (T[2].x, T[3].y, T[0].z, T[1].w); - X[3] = (uint4) (T[3].x, T[0].y, T[1].z, T[2].w); - #endif - - out_buf[j + 0] = X[0]; - out_buf[j + 1] = X[1]; - out_buf[j + 2] = X[2]; - out_buf[j + 3] = X[3]; + out_buf[j + 0] = X[ 0]; + out_buf[j + 1] = X[ 1]; + out_buf[j + 2] = X[ 2]; + out_buf[j + 3] = X[ 3]; + out_buf[j + 4] = X[ 4]; + out_buf[j + 5] = X[ 5]; + out_buf[j + 6] = X[ 6]; + out_buf[j + 7] = X[ 7]; + out_buf[j + 8] = X[ 8]; + out_buf[j + 9] = X[ 9]; + out_buf[j + 10] = X[10]; + out_buf[j + 11] = X[11]; + out_buf[j + 12] = X[12]; + out_buf[j + 13] = X[13]; + out_buf[j + 14] = X[14]; + out_buf[j + 15] = X[15]; } } -DECLSPEC void scrypt_blockmix_out (GLOBAL_AS uint4 *out_buf, const int out_len) +DECLSPEC void scrypt_blockmix_out (PRIVATE_AS u32 *out_buf, const int out_len) { - for (int i = 0, j = 0; i < out_len; i += 64, j += 4) + for (int i = 0, j = 0; i < out_len; i += SALSA_SZ, j += SALSA_CNT4) { - uint4 X[4]; + u32 T[SALSA_CNT4]; - X[0] = out_buf[j + 0]; - X[1] = out_buf[j + 1]; - X[2] = out_buf[j + 2]; - X[3] = out_buf[j + 3]; + T[ 0] = out_buf[j + 0]; + T[ 1] = out_buf[j + 13]; + T[ 2] = out_buf[j + 10]; + T[ 3] = out_buf[j + 7]; + T[ 4] = out_buf[j + 4]; + T[ 5] = out_buf[j + 1]; + T[ 6] = out_buf[j + 14]; + T[ 7] = out_buf[j + 11]; + T[ 8] = out_buf[j + 8]; + T[ 9] = out_buf[j + 5]; + T[10] = 
out_buf[j + 2]; + T[11] = out_buf[j + 15]; + T[12] = out_buf[j + 12]; + T[13] = out_buf[j + 9]; + T[14] = out_buf[j + 6]; + T[15] = out_buf[j + 3]; - uint4 T[4]; - - #if defined IS_CUDA || defined IS_HIP - T[0] = make_uint4 (X[0].x, X[3].y, X[2].z, X[1].w); - T[1] = make_uint4 (X[1].x, X[0].y, X[3].z, X[2].w); - T[2] = make_uint4 (X[2].x, X[1].y, X[0].z, X[3].w); - T[3] = make_uint4 (X[3].x, X[2].y, X[1].z, X[0].w); - #elif defined IS_METAL - T[0] = uint4 (X[0].x, X[3].y, X[2].z, X[1].w); - T[1] = uint4 (X[1].x, X[0].y, X[3].z, X[2].w); - T[2] = uint4 (X[2].x, X[1].y, X[0].z, X[3].w); - T[3] = uint4 (X[3].x, X[2].y, X[1].z, X[0].w); - #else - T[0] = (uint4) (X[0].x, X[3].y, X[2].z, X[1].w); - T[1] = (uint4) (X[1].x, X[0].y, X[3].z, X[2].w); - T[2] = (uint4) (X[2].x, X[1].y, X[0].z, X[3].w); - T[3] = (uint4) (X[3].x, X[2].y, X[1].z, X[0].w); - #endif - - out_buf[j + 0] = T[0]; - out_buf[j + 1] = T[1]; - out_buf[j + 2] = T[2]; - out_buf[j + 3] = T[3]; + out_buf[j + 0] = T[ 0]; + out_buf[j + 1] = T[ 1]; + out_buf[j + 2] = T[ 2]; + out_buf[j + 3] = T[ 3]; + out_buf[j + 4] = T[ 4]; + out_buf[j + 5] = T[ 5]; + out_buf[j + 6] = T[ 6]; + out_buf[j + 7] = T[ 7]; + out_buf[j + 8] = T[ 8]; + out_buf[j + 9] = T[ 9]; + out_buf[j + 10] = T[10]; + out_buf[j + 11] = T[11]; + out_buf[j + 12] = T[12]; + out_buf[j + 13] = T[13]; + out_buf[j + 14] = T[14]; + out_buf[j + 15] = T[15]; } } -DECLSPEC void scrypt_pbkdf2_body (PRIVATE_AS sha256_hmac_ctx_t *sha256_hmac_ctx, GLOBAL_AS uint4 *out_buf, const int out_len) +DECLSPEC void scrypt_pbkdf2_body (PRIVATE_AS sha256_hmac_ctx_t *sha256_hmac_ctx, PRIVATE_AS u32 *out_buf, const int out_len) { - for (int i = 0, j = 1, k = 0; i < out_len; i += 32, j += 1, k += 2) + for (int i = 0, j = 1, k = 0; i < out_len; i += 32, j += 1, k += 8) { sha256_hmac_ctx_t sha256_hmac_ctx2 = *sha256_hmac_ctx; @@ -473,34 +320,56 @@ DECLSPEC void scrypt_pbkdf2_body (PRIVATE_AS sha256_hmac_ctx_t *sha256_hmac_ctx, sha256_hmac_final (&sha256_hmac_ctx2); - u32 digest[8]; + // this will not work if user specifies output length not a multiple of 4 + // probably never happens... 
+ // let's hope the compiler will auto optimize this since out_len is very likely + // a constant at caller level - digest[0] = hc_swap32_S (sha256_hmac_ctx2.opad.h[0]); - digest[1] = hc_swap32_S (sha256_hmac_ctx2.opad.h[1]); - digest[2] = hc_swap32_S (sha256_hmac_ctx2.opad.h[2]); - digest[3] = hc_swap32_S (sha256_hmac_ctx2.opad.h[3]); - digest[4] = hc_swap32_S (sha256_hmac_ctx2.opad.h[4]); - digest[5] = hc_swap32_S (sha256_hmac_ctx2.opad.h[5]); - digest[6] = hc_swap32_S (sha256_hmac_ctx2.opad.h[6]); - digest[7] = hc_swap32_S (sha256_hmac_ctx2.opad.h[7]); - - #if defined IS_CUDA || defined IS_HIP - const uint4 tmp0 = make_uint4 (digest[0], digest[1], digest[2], digest[3]); - const uint4 tmp1 = make_uint4 (digest[4], digest[5], digest[6], digest[7]); - #elif defined IS_METAL - const uint4 tmp0 = uint4 (digest[0], digest[1], digest[2], digest[3]); - const uint4 tmp1 = uint4 (digest[4], digest[5], digest[6], digest[7]); - #else - const uint4 tmp0 = (uint4) (digest[0], digest[1], digest[2], digest[3]); - const uint4 tmp1 = (uint4) (digest[4], digest[5], digest[6], digest[7]); - #endif - - out_buf[k + 0] = tmp0; - out_buf[k + 1] = tmp1; + if (out_len >= (i + 4)) out_buf[k + 0] = hc_swap32_S (sha256_hmac_ctx2.opad.h[0]); + if (out_len >= (i + 8)) out_buf[k + 1] = hc_swap32_S (sha256_hmac_ctx2.opad.h[1]); + if (out_len >= (i + 12)) out_buf[k + 2] = hc_swap32_S (sha256_hmac_ctx2.opad.h[2]); + if (out_len >= (i + 16)) out_buf[k + 3] = hc_swap32_S (sha256_hmac_ctx2.opad.h[3]); + if (out_len >= (i + 20)) out_buf[k + 4] = hc_swap32_S (sha256_hmac_ctx2.opad.h[4]); + if (out_len >= (i + 24)) out_buf[k + 5] = hc_swap32_S (sha256_hmac_ctx2.opad.h[5]); + if (out_len >= (i + 28)) out_buf[k + 6] = hc_swap32_S (sha256_hmac_ctx2.opad.h[6]); + if (out_len >= (i + 32)) out_buf[k + 7] = hc_swap32_S (sha256_hmac_ctx2.opad.h[7]); } } -DECLSPEC void scrypt_pbkdf2 (GLOBAL_AS const u32 *pw_buf, const int pw_len, GLOBAL_AS const u32 *salt_buf, const int salt_len, GLOBAL_AS uint4 *out_buf, const int out_len) +DECLSPEC void scrypt_pbkdf2_pp (PRIVATE_AS const u32 *pw_buf, const int pw_len, PRIVATE_AS const u32 *salt_buf, const int salt_len, PRIVATE_AS u32 *out_buf, const int out_len) +{ + sha256_hmac_ctx_t sha256_hmac_ctx; + + sha256_hmac_init_swap (&sha256_hmac_ctx, pw_buf, pw_len); + + sha256_hmac_update_swap (&sha256_hmac_ctx, salt_buf, salt_len); + + scrypt_pbkdf2_body (&sha256_hmac_ctx, out_buf, out_len); +} + +DECLSPEC void scrypt_pbkdf2_pg (PRIVATE_AS const u32 *pw_buf, const int pw_len, GLOBAL_AS const u32 *salt_buf, const int salt_len, PRIVATE_AS u32 *out_buf, const int out_len) +{ + sha256_hmac_ctx_t sha256_hmac_ctx; + + sha256_hmac_init_swap (&sha256_hmac_ctx, pw_buf, pw_len); + + sha256_hmac_update_global_swap (&sha256_hmac_ctx, salt_buf, salt_len); + + scrypt_pbkdf2_body (&sha256_hmac_ctx, out_buf, out_len); +} + +DECLSPEC void scrypt_pbkdf2_gp (GLOBAL_AS const u32 *pw_buf, const int pw_len, PRIVATE_AS const u32 *salt_buf, const int salt_len, PRIVATE_AS u32 *out_buf, const int out_len) +{ + sha256_hmac_ctx_t sha256_hmac_ctx; + + sha256_hmac_init_global_swap (&sha256_hmac_ctx, pw_buf, pw_len); + + sha256_hmac_update_swap (&sha256_hmac_ctx, salt_buf, salt_len); + + scrypt_pbkdf2_body (&sha256_hmac_ctx, out_buf, out_len); +} + +DECLSPEC void scrypt_pbkdf2_gg (GLOBAL_AS const u32 *pw_buf, const int pw_len, GLOBAL_AS const u32 *salt_buf, const int salt_len, PRIVATE_AS u32 *out_buf, const int out_len) { sha256_hmac_ctx_t sha256_hmac_ctx; diff --git a/OpenCL/inc_hash_scrypt.h b/OpenCL/inc_hash_scrypt.h index 
6f3864cdd..9945c79a5 100644 --- a/OpenCL/inc_hash_scrypt.h +++ b/OpenCL/inc_hash_scrypt.h @@ -6,39 +6,40 @@ #ifndef INC_HASH_SCRYPT_H #define INC_HASH_SCRYPT_H -#define GET_SCRYPT_CNT(r,p) (2 * (r) * 16 * (p)) -#define GET_SMIX_CNT(r,N) (2 * (r) * 16 * (N)) -#define GET_STATE_CNT(r) (2 * (r) * 16) +#define GET_SCRYPT_SZ(r,p) (128 * (r) * (p)) +#define GET_STATE_SZ(r) (128 * (r)) -#define SCRYPT_CNT GET_SCRYPT_CNT (SCRYPT_R, SCRYPT_P) -#define SCRYPT_CNT4 (SCRYPT_CNT / 4) -#define STATE_CNT GET_STATE_CNT (SCRYPT_R) -#define STATE_CNT4 (STATE_CNT / 4) +// _SZ is the true size in bytes +#define SCRYPT_SZ GET_SCRYPT_SZ (SCRYPT_R, SCRYPT_P) +#define STATE_SZ GET_STATE_SZ (SCRYPT_R) + +// _CNT4 is the same size counted in 4-byte (u32) elements +#define SCRYPT_CNT4 (SCRYPT_SZ / 4) +#define STATE_CNT4 (STATE_SZ / 4) + +// _CNT44 counts uint4 elements, which feels more natural than dividing by 16 +#define SCRYPT_CNT44 ((SCRYPT_SZ / 4) / 4) +#define STATE_CNT44 ((STATE_SZ / 4) / 4) + +#define SALSA_SZ 64 +#define SALSA_CNT4 (SALSA_SZ / 4) #define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z)) #if defined IS_CUDA -inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); } -inline __device__ uint4 operator << (const uint4 a, const u32 b) { return make_uint4 ((a.x << b ), (a.y << b ), (a.z << b ), (a.w << b )); } -inline __device__ uint4 operator >> (const uint4 a, const u32 b) { return make_uint4 ((a.x >> b ), (a.y >> b ), (a.z >> b ), (a.w >> b )); } -inline __device__ uint4 operator + (const uint4 a, const uint4 b) { return make_uint4 ((a.x + b.x), (a.y + b.y), (a.z + b.z), (a.w + b.w)); } -inline __device__ uint4 operator ^ (const uint4 a, const uint4 b) { return make_uint4 ((a.x ^ b.x), (a.y ^ b.y), (a.z ^ b.z), (a.w ^ b.w)); } -inline __device__ uint4 operator | (const uint4 a, const uint4 b) { return make_uint4 ((a.x | b.x), (a.y | b.y), (a.z | b.z), (a.w | b.w)); } -inline __device__ void operator ^= ( uint4 &a, const uint4 b) { a.x ^= b.x; a.y ^= b.y; a.z ^= b.z; a.w ^= b.w; } -#endif -#if defined IS_CUDA || defined IS_HIP -inline __device__ uint4 rotate (const uint4 a, const int n) +DECLSPEC uint4 operator ^ (const uint4 a, const uint4 b) { uint4 r; - r.x = hc_rotl32_S (r.x, n); - r.y = hc_rotl32_S (r.y, n); - r.z = hc_rotl32_S (r.z, n); - r.w = hc_rotl32_S (r.w, n); + r.x = a.x ^ b.x; + r.y = a.y ^ b.y; + r.z = a.z ^ b.z; + r.w = a.w ^ b.w; return r; } + #endif #endif diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl index 1125236dd..c04575066 100644 --- a/OpenCL/inc_platform.cl +++ b/OpenCL/inc_platform.cl @@ -131,7 +131,7 @@ DECLSPEC u32x rotl32 (const u32x a, const int n) return __funnelshift_l(a, a, n); #endif - u32x t; + u32x t = 0; #if VECT_SIZE >= 2 t.s0 = __funnelshift_l(a.s0, a.s0, n); @@ -171,7 +171,7 @@ DECLSPEC u32x rotr32 (const u32x a, const int n) return __funnelshift_r(a, a, n); #endif - u32x t; + u32x t = 0; #if VECT_SIZE >= 2 t.s0 = __funnelshift_r(a.s0, a.s0, n); diff --git a/OpenCL/inc_vendor.h b/OpenCL/inc_vendor.h index 38afefebe..5d269beec 100644 --- a/OpenCL/inc_vendor.h +++ b/OpenCL/inc_vendor.h @@ -148,30 +148,20 @@ using namespace metal; #define HC_INLINE inline static #endif +#if defined IS_AMD && defined IS_GPU +#define DECLSPEC HC_INLINE +#elif defined IS_CUDA +#define DECLSPEC __device__ +#elif defined IS_HIP +#define DECLSPEC __device__ +#else +#define DECLSPEC +#endif + #if defined FIXED_LOCAL_SIZE #define HC_ATTR_SEQ 
FIXED_THREAD_COUNT((FIXED_LOCAL_SIZE)) #else -#if defined IS_AMD && defined IS_GPU #define HC_ATTR_SEQ -#define DECLSPEC HC_INLINE -#elif defined IS_HIP -#define HC_ATTR_SEQ __launch_bounds__((MAX_THREADS_PER_BLOCK), 0) -#define DECLSPEC __device__ HC_INLINE -#elif defined IS_CUDA -#define HC_ATTR_SEQ -#define DECLSPEC -#else -#define HC_ATTR_SEQ -#define DECLSPEC -#endif -#endif - -#if defined IS_AMD && defined IS_GPU -#define DECLSPEC HC_INLINE -#elif defined IS_HIP -#define DECLSPEC __device__ HC_INLINE -#else -#define DECLSPEC #endif /** diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl index 2cd5e1729..bc2730be9 100644 --- a/OpenCL/m08900-pure.cl +++ b/OpenCL/m08900-pure.cl @@ -21,7 +21,7 @@ typedef struct #define SCRYPT_TMP_ELEM 1 #endif - uint4 P[SCRYPT_TMP_ELEM]; + u32 P[SCRYPT_TMP_ELEM]; } scrypt_tmp_t; @@ -31,9 +31,13 @@ KERNEL_FQ void HC_ATTR_SEQ m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, tmps[gid].P, SCRYPT_CNT * 4); + u32 out[SCRYPT_CNT4]; - scrypt_blockmix_in (tmps[gid].P, SCRYPT_CNT * 4); + scrypt_pbkdf2_gg (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, out, SCRYPT_SZ); + + scrypt_blockmix_in (out, SCRYPT_SZ); + + for (u32 i = 0; i < SCRYPT_CNT4; i++) tmps[gid].P[i] = out[i]; } KERNEL_FQ void HC_ATTR_SEQ m08900_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -45,27 +49,15 @@ KERNEL_FQ void HC_ATTR_SEQ m08900_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; - #ifdef IS_HIP - LOCAL_VK uint4 X_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *X = X_s[lid]; - #else - uint4 X[STATE_CNT4]; - #endif + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; + scrypt_smix_init (X, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -77,29 +69,16 @@ KERNEL_FQ void HC_ATTR_SEQ m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; + u32 T[STATE_CNT4]; - uint4 X[STATE_CNT4]; + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - #ifdef IS_HIP - LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *T = T_s[lid]; - #else - uint4 T[STATE_CNT4]; - #endif + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + scrypt_smix_loop (X, T, d_extra0_buf, d_extra1_buf, d_extra2_buf, 
d_extra3_buf, gid, lid, lsz, bid); - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; - - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -108,14 +87,20 @@ KERNEL_FQ void HC_ATTR_SEQ m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - scrypt_blockmix_out (tmps[gid].P, SCRYPT_CNT * 4); + u32 x[SCRYPT_CNT4]; - scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, (GLOBAL_AS const u32 *) tmps[gid].P, SCRYPT_CNT * 4, tmps[gid].P, 16); + for (u32 i = 0; i < SCRYPT_CNT4; i++) x[i] = tmps[gid].P[i]; - const u32 r0 = tmps[gid].P[0].x; - const u32 r1 = tmps[gid].P[0].y; - const u32 r2 = tmps[gid].P[0].z; - const u32 r3 = tmps[gid].P[0].w; + scrypt_blockmix_out (x, SCRYPT_SZ); + + u32 out[4]; + + scrypt_pbkdf2_gp (pws[gid].i, pws[gid].pw_len, x, SCRYPT_SZ, out, 16); + + const u32 r0 = out[0]; + const u32 r1 = out[1]; + const u32 r2 = out[2]; + const u32 r3 = out[3]; #define il_pos 0 diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl index aa7e497ee..99e3bd78f 100644 --- a/OpenCL/m15700-pure.cl +++ b/OpenCL/m15700-pure.cl @@ -21,7 +21,7 @@ typedef struct #define SCRYPT_TMP_ELEM 1 #endif - uint4 P[SCRYPT_TMP_ELEM]; + u32 P[SCRYPT_TMP_ELEM]; } scrypt_tmp_t; @@ -167,9 +167,13 @@ KERNEL_FQ void HC_ATTR_SEQ m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethe if (gid >= GID_CNT) return; - scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, tmps[gid].P, SCRYPT_CNT * 4); + u32 out[SCRYPT_CNT4]; - scrypt_blockmix_in (tmps[gid].P, SCRYPT_CNT * 4); + scrypt_pbkdf2_gg (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, out, SCRYPT_SZ); + + scrypt_blockmix_in (out, SCRYPT_SZ); + + for (u32 i = 0; i < SCRYPT_CNT4; i++) tmps[gid].P[i] = out[i]; } KERNEL_FQ void HC_ATTR_SEQ m15700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -181,27 +185,15 @@ KERNEL_FQ void HC_ATTR_SEQ m15700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; - #ifdef IS_HIP - LOCAL_VK uint4 X_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *X = X_s[lid]; - #else - uint4 X[STATE_CNT4]; - #endif + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; + scrypt_smix_init (X, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m15700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -213,29 +205,16 @@ KERNEL_FQ void HC_ATTR_SEQ m15700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) 
d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; + u32 T[STATE_CNT4]; - uint4 X[STATE_CNT4]; + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - #ifdef IS_HIP - LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *T = T_s[lid]; - #else - uint4 T[STATE_CNT4]; - #endif + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + scrypt_smix_loop (X, T, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; - - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t)) @@ -244,9 +223,15 @@ KERNEL_FQ void HC_ATTR_SEQ m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethe if (gid >= GID_CNT) return; - scrypt_blockmix_out (tmps[gid].P, SCRYPT_CNT * 4); + u32 x[SCRYPT_CNT4]; - scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, (GLOBAL_AS const u32 *) tmps[gid].P, SCRYPT_CNT * 4, tmps[gid].P, 32); + for (u32 i = 0; i < SCRYPT_CNT4; i++) x[i] = tmps[gid].P[i]; + + scrypt_blockmix_out (x, SCRYPT_SZ); + + u32 out[8]; + + scrypt_pbkdf2_gp (pws[gid].i, pws[gid].pw_len, x, SCRYPT_SZ, out, 32); /** * keccak @@ -265,10 +250,10 @@ KERNEL_FQ void HC_ATTR_SEQ m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethe u32 key[4]; - key[0] = tmps[gid].P[1].x; - key[1] = tmps[gid].P[1].y; - key[2] = tmps[gid].P[1].z; - key[3] = tmps[gid].P[1].w; + key[0] = out[4]; + key[1] = out[5]; + key[2] = out[6]; + key[3] = out[7]; u64 st[25]; diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl index be6c18cb9..e9873a21f 100644 --- a/OpenCL/m22700-pure.cl +++ b/OpenCL/m22700-pure.cl @@ -22,7 +22,7 @@ typedef struct #define SCRYPT_TMP_ELEM 1 #endif - uint4 P[SCRYPT_TMP_ELEM]; + u32 P[SCRYPT_TMP_ELEM]; } scrypt_tmp_t; @@ -96,23 +96,18 @@ KERNEL_FQ void HC_ATTR_SEQ m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t)) | ((w[j] << 8) & 0xff00ff00); } - sha256_hmac_ctx_t sha256_hmac_ctx; + u32 s[16] = { 0 }; - sha256_hmac_init_swap (&sha256_hmac_ctx, w, w_len); + s[0] = hc_swap32_S (MULTIBIT_S0); + s[1] = hc_swap32_S (MULTIBIT_S1); - u32 x0[4] = { 0 }; - u32 x1[4] = { 0 }; - u32 x2[4] = { 0 }; - u32 x3[4] = { 0 }; + u32 out[SCRYPT_CNT4]; - x0[0] = MULTIBIT_S0; - x0[1] = MULTIBIT_S1; + scrypt_pbkdf2_pp (w, w_len, s, 8, out, SCRYPT_SZ); - sha256_hmac_update_64 (&sha256_hmac_ctx, x0, x1, x2, x3, 8); + scrypt_blockmix_in (out, SCRYPT_SZ); - scrypt_pbkdf2_body (&sha256_hmac_ctx, tmps[gid].P, SCRYPT_CNT * 4); - - scrypt_blockmix_in (tmps[gid].P, SCRYPT_CNT * 4); + for (u32 i = 0; i < SCRYPT_CNT4; i++) tmps[gid].P[i] = out[i]; } KERNEL_FQ void HC_ATTR_SEQ m22700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -124,27 +119,15 @@ KERNEL_FQ void HC_ATTR_SEQ m22700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 
X[STATE_CNT4]; - #ifdef IS_HIP - LOCAL_VK uint4 X_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *X = X_s[lid]; - #else - uint4 X[STATE_CNT4]; - #endif + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; + scrypt_smix_init (X, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -156,29 +139,16 @@ KERNEL_FQ void HC_ATTR_SEQ m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; + u32 T[STATE_CNT4]; - uint4 X[STATE_CNT4]; + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - #ifdef IS_HIP - LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *T = T_s[lid]; - #else - uint4 T[STATE_CNT4]; - #endif + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + scrypt_smix_loop (X, T, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; - - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -261,28 +231,29 @@ KERNEL_FQ void HC_ATTR_SEQ m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) | ((w[j] << 8) & 0xff00ff00); } - scrypt_blockmix_out (tmps[gid].P, SCRYPT_CNT * 4); - sha256_hmac_ctx_t ctx; + u32 x[SCRYPT_CNT4]; - sha256_hmac_init_swap (&ctx, w, w_len); + for (u32 i = 0; i < SCRYPT_CNT4; i++) x[i] = tmps[gid].P[i]; - sha256_hmac_update_global_swap (&ctx, (GLOBAL_AS const u32 *) tmps[gid].P, SCRYPT_CNT * 4); + scrypt_blockmix_out (x, SCRYPT_SZ); - scrypt_pbkdf2_body (&ctx, tmps[gid].P, 16); + u32 out[8]; + + scrypt_pbkdf2_pp (w, w_len, x, SCRYPT_SZ, out, 32); // AES256-CBC decrypt with IV from salt buffer (dynamic, alternative 1): u32 key[8]; - key[0] = tmps[gid].P[0].x; - key[1] = tmps[gid].P[0].y; - key[2] = tmps[gid].P[0].z; - key[3] = tmps[gid].P[0].w; - key[4] = tmps[gid].P[1].x; - key[5] = tmps[gid].P[1].y; - key[6] = tmps[gid].P[1].z; - key[7] = tmps[gid].P[1].w; + key[0] = out[0]; + key[1] = out[1]; + key[2] = out[2]; + key[3] = out[3]; + key[4] = out[4]; + key[5] = out[5]; + key[6] = out[6]; + key[7] = out[7]; #define KEYLEN 60 diff --git a/OpenCL/m24000-pure.cl b/OpenCL/m24000-pure.cl index d00e4c139..fa22ebab4 100644 --- a/OpenCL/m24000-pure.cl +++ b/OpenCL/m24000-pure.cl @@ -22,7 +22,7 @@ typedef struct #define SCRYPT_TMP_ELEM 1 #endif - uint4 P[SCRYPT_TMP_ELEM]; + u32 P[SCRYPT_TMP_ELEM]; } scrypt_tmp_t; @@ -171,17 +171,17 @@ KERNEL_FQ void HC_ATTR_SEQ m24000_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, best if (gid >= GID_CNT) return; - 
scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, tmps[gid].P, SCRYPT_CNT * 4); + u32 out[SCRYPT_CNT4]; - scrypt_blockmix_in (tmps[gid].P, SCRYPT_CNT * 4); + scrypt_pbkdf2_gg (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, out, SCRYPT_SZ); + + scrypt_blockmix_in (out, SCRYPT_SZ); + + for (u32 i = 0; i < SCRYPT_CNT4; i++) tmps[gid].P[i] = out[i]; } KERNEL_FQ void HC_ATTR_SEQ m24000_loop_prepare (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, bestcrypt_scrypt_t)) { - /** - * base - */ - const u64 gid = get_global_id (0); const u64 lid = get_local_id (0); const u64 lsz = get_local_size (0); @@ -189,24 +189,15 @@ KERNEL_FQ void HC_ATTR_SEQ m24000_loop_prepare (KERN_ATTR_TMPS_ESALT (scrypt_tmp if (gid >= GID_CNT) return; - // SCRYPT part, init V + u32 X[STATE_CNT4]; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - uint4 X[STATE_CNT4]; + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + scrypt_smix_init (X, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; - - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m24000_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, bestcrypt_scrypt_t)) @@ -218,29 +209,16 @@ KERNEL_FQ void HC_ATTR_SEQ m24000_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, best if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; + u32 T[STATE_CNT4]; - uint4 X[STATE_CNT4]; + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - #ifdef IS_HIP - LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *T = T_s[lid]; - #else - uint4 T[STATE_CNT4]; - #endif + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + scrypt_smix_loop (X, T, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; - - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m24000_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, bestcrypt_scrypt_t)) @@ -305,9 +283,15 @@ KERNEL_FQ void HC_ATTR_SEQ m24000_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, best if (gid >= GID_CNT) return; - scrypt_blockmix_out (tmps[gid].P, SCRYPT_CNT * 4); + u32 x[SCRYPT_CNT4]; - scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, (GLOBAL_AS const u32 *) tmps[gid].P, SCRYPT_CNT * 4, tmps[gid].P, 32); + for (u32 i = 0; i < SCRYPT_CNT4; i++) x[i] = tmps[gid].P[i]; + + 
scrypt_blockmix_out (x, SCRYPT_SZ); + + u32 out[8]; + + scrypt_pbkdf2_gp (pws[gid].i, pws[gid].pw_len, x, SCRYPT_SZ, out, 32); u32 version = esalt_bufs[DIGESTS_OFFSET_HOST].version; @@ -317,14 +301,14 @@ KERNEL_FQ void HC_ATTR_SEQ m24000_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, best u32 key[8]; - key[0] = tmps[gid].P[0].x; - key[1] = tmps[gid].P[0].y; - key[2] = tmps[gid].P[0].z; - key[3] = tmps[gid].P[0].w; - key[4] = tmps[gid].P[1].x; - key[5] = tmps[gid].P[1].y; - key[6] = tmps[gid].P[1].z; - key[7] = tmps[gid].P[1].w; + key[0] = out[0]; + key[1] = out[1]; + key[2] = out[2]; + key[3] = out[3]; + key[4] = out[4]; + key[5] = out[5]; + key[6] = out[6]; + key[7] = out[7]; if (version == 0x38) //0x38 is char for '8' which is the crypto type passed in position 3 of hash ( $08$ ) { diff --git a/OpenCL/m27700-pure.cl b/OpenCL/m27700-pure.cl index 2bbe3ba01..a6482afdf 100644 --- a/OpenCL/m27700-pure.cl +++ b/OpenCL/m27700-pure.cl @@ -19,7 +19,7 @@ typedef struct #define SCRYPT_TMP_ELEM 1 #endif - uint4 P[SCRYPT_TMP_ELEM]; + u32 P[SCRYPT_TMP_ELEM]; } scrypt_tmp_t; @@ -46,23 +46,18 @@ KERNEL_FQ void HC_ATTR_SEQ m27700_init (KERN_ATTR_TMPS (scrypt_tmp_t)) | ((w[j] << 8) & 0xff00ff00); } - sha256_hmac_ctx_t sha256_hmac_ctx; + u32 s[16] = { 0 }; - sha256_hmac_init_swap (&sha256_hmac_ctx, w, w_len); + s[0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[0]); + s[1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[1]); - u32 x0[4] = { 0 }; - u32 x1[4] = { 0 }; - u32 x2[4] = { 0 }; - u32 x3[4] = { 0 }; + u32 out[SCRYPT_CNT4]; - x0[0] = salt_bufs[SALT_POS_HOST].salt_buf[0]; - x0[1] = salt_bufs[SALT_POS_HOST].salt_buf[1]; + scrypt_pbkdf2_pp (w, w_len, s, 8, out, SCRYPT_SZ); - sha256_hmac_update_64 (&sha256_hmac_ctx, x0, x1, x2, x3, 8); + scrypt_blockmix_in (out, SCRYPT_SZ); - scrypt_pbkdf2_body (&sha256_hmac_ctx, tmps[gid].P, SCRYPT_CNT * 4); - - scrypt_blockmix_in (tmps[gid].P, SCRYPT_CNT * 4); + for (u32 i = 0; i < SCRYPT_CNT4; i++) tmps[gid].P[i] = out[i]; } KERNEL_FQ void HC_ATTR_SEQ m27700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -74,27 +69,15 @@ KERNEL_FQ void HC_ATTR_SEQ m27700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; - #ifdef IS_HIP - LOCAL_VK uint4 X_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *X = X_s[lid]; - #else - uint4 X[STATE_CNT4]; - #endif + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; + scrypt_smix_init (X, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m27700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -106,29 +89,16 @@ KERNEL_FQ void HC_ATTR_SEQ m27700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 
*d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; + u32 T[STATE_CNT4]; - uint4 X[STATE_CNT4]; + GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4); - #ifdef IS_HIP - LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *T = T_s[lid]; - #else - uint4 T[STATE_CNT4]; - #endif + for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - const u32 P_offset = SALT_REPEAT * STATE_CNT4; + scrypt_smix_loop (X, T, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid); - GLOBAL_AS uint4 *P = tmps[gid].P + P_offset; - - for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z]; - - scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid); - - for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z]; + for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z]; } KERNEL_FQ void HC_ATTR_SEQ m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) @@ -211,28 +181,29 @@ KERNEL_FQ void HC_ATTR_SEQ m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) | ((w[j] << 8) & 0xff00ff00); } - scrypt_blockmix_out (tmps[gid].P, SCRYPT_CNT * 4); - sha256_hmac_ctx_t ctx; + u32 x[SCRYPT_CNT4]; - sha256_hmac_init_swap (&ctx, w, w_len); + for (u32 i = 0; i < SCRYPT_CNT4; i++) x[i] = tmps[gid].P[i]; - sha256_hmac_update_global_swap (&ctx, (GLOBAL_AS const u32 *) tmps[gid].P, SCRYPT_CNT * 4); + scrypt_blockmix_out (x, SCRYPT_SZ); - scrypt_pbkdf2_body (&ctx, tmps[gid].P, 16); + u32 out[8]; + + scrypt_pbkdf2_pp (w, w_len, x, SCRYPT_SZ, out, 32); // AES256-CBC decrypt u32 key[8]; - key[0] = tmps[gid].P[0].x; - key[1] = tmps[gid].P[0].y; - key[2] = tmps[gid].P[0].z; - key[3] = tmps[gid].P[0].w; - key[4] = tmps[gid].P[1].x; - key[5] = tmps[gid].P[1].y; - key[6] = tmps[gid].P[1].z; - key[7] = tmps[gid].P[1].w; + key[0] = out[0]; + key[1] = out[1]; + key[2] = out[2]; + key[3] = out[3]; + key[4] = out[4]; + key[5] = out[5]; + key[6] = out[6]; + key[7] = out[7]; #define KEYLEN 60 diff --git a/OpenCL/m28200-pure.cl b/OpenCL/m28200-pure.cl index 18f0cb161..e335b89bf 100644 --- a/OpenCL/m28200-pure.cl +++ b/OpenCL/m28200-pure.cl @@ -23,7 +23,7 @@ typedef struct exodus_tmp #define SCRYPT_TMP_ELEM 1 #endif - uint4 P[SCRYPT_TMP_ELEM]; + u32 P[SCRYPT_TMP_ELEM]; } exodus_tmp_t; @@ -41,9 +41,13 @@ KERNEL_FQ void HC_ATTR_SEQ m28200_init (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exod if (gid >= GID_CNT) return; - scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, tmps[gid].P, SCRYPT_CNT * 4); + u32 out[SCRYPT_CNT4]; - scrypt_blockmix_in (tmps[gid].P, SCRYPT_CNT * 4); + scrypt_pbkdf2_gg (pws[gid].i, pws[gid].pw_len, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len, out, SCRYPT_SZ); + + scrypt_blockmix_in (out, SCRYPT_SZ); + + for (u32 i = 0; i < SCRYPT_CNT4; i++) tmps[gid].P[i] = out[i]; } KERNEL_FQ void HC_ATTR_SEQ m28200_loop_prepare (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t)) @@ -55,27 +59,15 @@ KERNEL_FQ void HC_ATTR_SEQ m28200_loop_prepare (KERN_ATTR_TMPS_ESALT (exodus_tmp if (gid >= GID_CNT) return; - GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; - GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; - GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; - GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + u32 X[STATE_CNT4]; - #ifdef IS_HIP - LOCAL_VK uint4 X_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; - LOCAL_AS uint4 *X = X_s[lid]; - #else - uint4 X[STATE_CNT4]; - 
   #endif

+  GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4);

-  const u32 P_offset = SALT_REPEAT * STATE_CNT4;
+  for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z];

-  GLOBAL_AS uint4 *P = tmps[gid].P + P_offset;
+  scrypt_smix_init (X, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid);

-  for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z];
-
-  scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid);
-
-  for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z];
+  for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z];
 }

 KERNEL_FQ void HC_ATTR_SEQ m28200_loop (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
@@ -87,29 +79,16 @@ KERNEL_FQ void HC_ATTR_SEQ m28200_loop (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exod
   if (gid >= GID_CNT) return;

-  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
-  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
-  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
-  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
+  u32 X[STATE_CNT4];
+  u32 T[STATE_CNT4];

-  uint4 X[STATE_CNT4];
+  GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4);

-  #ifdef IS_HIP
-  LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4];
-  LOCAL_AS uint4 *T = T_s[lid];
-  #else
-  uint4 T[STATE_CNT4];
-  #endif
+  for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z];

-  const u32 P_offset = SALT_REPEAT * STATE_CNT4;
+  scrypt_smix_loop (X, T, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid);

-  GLOBAL_AS uint4 *P = tmps[gid].P + P_offset;
-
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z];
-
-  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid);
-
-  for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z];
+  for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z];
 }

 KERNEL_FQ void HC_ATTR_SEQ m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t))
@@ -171,22 +150,28 @@ KERNEL_FQ void HC_ATTR_SEQ m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exod
   if (gid >= GID_CNT) return;

-  scrypt_blockmix_out (tmps[gid].P, SCRYPT_CNT * 4);
+  u32 x[SCRYPT_CNT4];

-  scrypt_pbkdf2 (pws[gid].i, pws[gid].pw_len, (GLOBAL_AS const u32 *) tmps[gid].P, SCRYPT_CNT * 4, tmps[gid].P, 32);
+  for (u32 i = 0; i < SCRYPT_CNT4; i++) x[i] = tmps[gid].P[i];
+
+  scrypt_blockmix_out (x, SCRYPT_SZ);
+
+  u32 out[8];
+
+  scrypt_pbkdf2_gp (pws[gid].i, pws[gid].pw_len, x, SCRYPT_SZ, out, 32);

   // GCM stuff

   u32 ukey[8];

-  ukey[0] = hc_swap32_S (tmps[gid].P[0].x);
-  ukey[1] = hc_swap32_S (tmps[gid].P[0].y);
-  ukey[2] = hc_swap32_S (tmps[gid].P[0].z);
-  ukey[3] = hc_swap32_S (tmps[gid].P[0].w);
-  ukey[4] = hc_swap32_S (tmps[gid].P[1].x);
-  ukey[5] = hc_swap32_S (tmps[gid].P[1].y);
-  ukey[6] = hc_swap32_S (tmps[gid].P[1].z);
-  ukey[7] = hc_swap32_S (tmps[gid].P[1].w);
+  ukey[0] = hc_swap32_S (out[0]);
+  ukey[1] = hc_swap32_S (out[1]);
+  ukey[2] = hc_swap32_S (out[2]);
+  ukey[3] = hc_swap32_S (out[3]);
+  ukey[4] = hc_swap32_S (out[4]);
+  ukey[5] = hc_swap32_S (out[5]);
+  ukey[6] = hc_swap32_S (out[6]);
+  ukey[7] = hc_swap32_S (out[7]);

   u32 key[60] = { 0 };
   u32 subKey[4] = { 0 };

diff --git a/OpenCL/m29800-pure.cl b/OpenCL/m29800-pure.cl
index fb58a9822..98d3fca70 100644
--- a/OpenCL/m29800-pure.cl
+++ b/OpenCL/m29800-pure.cl
@@ -19,7 +19,7 @@ typedef struct
   #define SCRYPT_TMP_ELEM 1
   #endif

-  uint4 P[SCRYPT_TMP_ELEM];
+  u32 P[SCRYPT_TMP_ELEM];

 } scrypt_tmp_t;

@@ -46,23 +46,18 @@ KERNEL_FQ void HC_ATTR_SEQ m29800_init (KERN_ATTR_TMPS (scrypt_tmp_t))
                 | ((w[j] << 8) & 0xff00ff00);
   }

-  sha256_hmac_ctx_t sha256_hmac_ctx;
+  u32 s[16] = { 0 };

-  sha256_hmac_init_swap (&sha256_hmac_ctx, w, w_len);
+  s[0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[0]);
+  s[1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[1]);

-  u32 x0[4] = { 0 };
-  u32 x1[4] = { 0 };
-  u32 x2[4] = { 0 };
-  u32 x3[4] = { 0 };
+  u32 out[SCRYPT_CNT4];

-  x0[0] = salt_bufs[SALT_POS_HOST].salt_buf[0];
-  x0[1] = salt_bufs[SALT_POS_HOST].salt_buf[1];
+  scrypt_pbkdf2_pp (w, w_len, s, 8, out, SCRYPT_SZ);

-  sha256_hmac_update_64 (&sha256_hmac_ctx, x0, x1, x2, x3, 8);
+  scrypt_blockmix_in (out, SCRYPT_SZ);

-  scrypt_pbkdf2_body (&sha256_hmac_ctx, tmps[gid].P, SCRYPT_CNT * 4);
-
-  scrypt_blockmix_in (tmps[gid].P, SCRYPT_CNT * 4);
+  for (u32 i = 0; i < SCRYPT_CNT4; i++) tmps[gid].P[i] = out[i];
 }

 KERNEL_FQ void HC_ATTR_SEQ m29800_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -74,27 +69,15 @@ KERNEL_FQ void HC_ATTR_SEQ m29800_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
   if (gid >= GID_CNT) return;

-  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
-  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
-  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
-  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
+  u32 X[STATE_CNT4];

-  #ifdef IS_HIP
-  LOCAL_VK uint4 X_s[MAX_THREADS_PER_BLOCK][STATE_CNT4];
-  LOCAL_AS uint4 *X = X_s[lid];
-  #else
-  uint4 X[STATE_CNT4];
-  #endif
+  GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4);

-  const u32 P_offset = SALT_REPEAT * STATE_CNT4;
+  for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z];

-  GLOBAL_AS uint4 *P = tmps[gid].P + P_offset;
+  scrypt_smix_init (X, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid);

-  for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z];
-
-  scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid);
-
-  for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z];
+  for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z];
 }

 KERNEL_FQ void HC_ATTR_SEQ m29800_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -106,29 +89,16 @@ KERNEL_FQ void HC_ATTR_SEQ m29800_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
   if (gid >= GID_CNT) return;

-  GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf;
-  GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf;
-  GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf;
-  GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;
+  u32 X[STATE_CNT4];
+  u32 T[STATE_CNT4];

-  uint4 X[STATE_CNT4];
+  GLOBAL_AS u32 *P = tmps[gid].P + (SALT_REPEAT * STATE_CNT4);

-  #ifdef IS_HIP
-  LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4];
-  LOCAL_AS uint4 *T = T_s[lid];
-  #else
-  uint4 T[STATE_CNT4];
-  #endif
+  for (u32 z = 0; z < STATE_CNT4; z++) X[z] = P[z];

-  const u32 P_offset = SALT_REPEAT * STATE_CNT4;
+  scrypt_smix_loop (X, T, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, gid, lid, lsz, bid);

-  GLOBAL_AS uint4 *P = tmps[gid].P + P_offset;
-
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = P[z];
-
-  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf, gid, lid, lsz, bid);
-
-  for (int z = 0; z < STATE_CNT4; z++) P[z] = X[z];
+  for (u32 z = 0; z < STATE_CNT4; z++) P[z] = X[z];
 }

 KERNEL_FQ void HC_ATTR_SEQ m29800_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -211,28 +181,29 @@ KERNEL_FQ void HC_ATTR_SEQ m29800_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
                 | ((w[j] << 8) & 0xff00ff00);
   }

-  scrypt_blockmix_out (tmps[gid].P, SCRYPT_CNT * 4);
-  sha256_hmac_ctx_t ctx;
+  u32 x[SCRYPT_CNT4];

-  sha256_hmac_init_swap (&ctx, w, w_len);
+  for (u32 i = 0; i < SCRYPT_CNT4; i++) x[i] = tmps[gid].P[i];

-  sha256_hmac_update_global_swap (&ctx, (GLOBAL_AS const u32 *) tmps[gid].P, SCRYPT_CNT * 4);
+  scrypt_blockmix_out (x, SCRYPT_SZ);

-  scrypt_pbkdf2_body (&ctx, tmps[gid].P, 16);
+  u32 out[8];
+
+  scrypt_pbkdf2_pp (w, w_len, x, SCRYPT_SZ, out, 32);

   // AES256-CBC decrypt

   u32 key[8];

-  key[0] = tmps[gid].P[0].x;
-  key[1] = tmps[gid].P[0].y;
-  key[2] = tmps[gid].P[0].z;
-  key[3] = tmps[gid].P[0].w;
-  key[4] = tmps[gid].P[1].x;
-  key[5] = tmps[gid].P[1].y;
-  key[6] = tmps[gid].P[1].z;
-  key[7] = tmps[gid].P[1].w;
+  key[0] = out[0];
+  key[1] = out[1];
+  key[2] = out[2];
+  key[3] = out[3];
+  key[4] = out[4];
+  key[5] = out[5];
+  key[6] = out[6];
+  key[7] = out[7];

   #define KEYLEN 60

diff --git a/include/shared.h b/include/shared.h
index bc27b49e0..2c7f801f6 100644
--- a/include/shared.h
+++ b/include/shared.h
@@ -117,4 +117,7 @@ char *file_to_buffer (const char *filename);
 bool check_file_suffix (const char *file, const char *suffix);
 bool remove_file_suffix (char *file, const char *suffix);

+int suppress_stderr (void);
+void restore_stderr (int saved_fd);
+
 #endif // HC_SHARED_H

diff --git a/src/backend.c b/src/backend.c
index c69792622..51375a56a 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -486,22 +486,13 @@ static bool opencl_test_instruction (hashcat_ctx_t *hashcat_ctx, cl_context cont
   OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;

   #ifndef DEBUG
-  const int fd_stderr = fileno (stderr);
-  const int stderr_bak = dup (fd_stderr);
-  #ifdef _WIN
-  const int tmp = open ("NUL", O_WRONLY);
-  #else
-  const int tmp = open ("/dev/null", O_WRONLY);
-  #endif
-  dup2 (tmp, fd_stderr);
-  close (tmp);
+  int saved_stderr = suppress_stderr ();
   #endif

   const int CL_rc = ocl->clBuildProgram (program, 1, &device, NULL, NULL, NULL);

   #ifndef DEBUG
-  dup2 (stderr_bak, fd_stderr);
-  close (stderr_bak);
+  restore_stderr (saved_stderr);
   #endif

   if (CL_rc != CL_SUCCESS)
@@ -8984,8 +8975,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
     nvrtc_options[nvrtc_options_idx++] = "--std=c++14";
   }

-  nvrtc_options[nvrtc_options_idx++] = "--restrict";
-  nvrtc_options[nvrtc_options_idx++] = "--device-as-default-execution-space";
+  //nvrtc_options[nvrtc_options_idx++] = "--restrict";

   nvrtc_options[nvrtc_options_idx++] = "--gpu-architecture";

   hc_asprintf (&nvrtc_options[nvrtc_options_idx++], "compute_%d", (device_param->sm_major * 10) + device_param->sm_minor);
@@ -9243,6 +9233,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
   hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);

   hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-architecture=%s", device_param->gcnArchName);
+  hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); // untested but it should work

   #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__)
@@ -10598,6 +10589,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
   build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D XM2S(x)=#x ");
   build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D M2S(x)=XM2S(x) ");
+  build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D MAX_THREADS_PER_BLOCK=%d ", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);

   #if defined (__APPLE__)

   if (is_apple_silicon () == true)

diff --git a/src/bridges/bridge_python_generic_hash_mp.c b/src/bridges/bridge_python_generic_hash_mp.c
index f5d31aeab..4dcfe0ea5 100644
--- a/src/bridges/bridge_python_generic_hash_mp.c
+++ b/src/bridges/bridge_python_generic_hash_mp.c
@@ -195,9 +195,9 @@ typedef struct
 } python_interpreter_t;

 #if defined (_WIN) || defined (__CYGWIN__) || defined (__APPLE__)
-static char *DEFAULT_SOURCE_FILENAME = "./Python/generic_hash_sp.py";
+static char *DEFAULT_SOURCE_FILENAME = "Python/generic_hash_sp.py";
 #else
-static char *DEFAULT_SOURCE_FILENAME = "./Python/generic_hash_mp.py";
+static char *DEFAULT_SOURCE_FILENAME = "Python/generic_hash_mp.py";
 #endif

 const char *extract_module_name (const char *path)
@@ -227,78 +227,6 @@ const char *extract_module_name (const char *path)
   return module_name;
 }

-char *load_source (const char *filename)
-{
-  FILE *fp = fopen (filename, "r");
-
-  if (fp == NULL)
-  {
-    fprintf (stderr, "%s: %s\n", filename, strerror (errno));
-
-    return NULL;
-  }
-
-  fseek (fp, 0, SEEK_END);
-
-  const size_t size = ftell (fp);
-
-  fseek (fp, 0, SEEK_SET);
-
-  char *source = hcmalloc (size + 1);
-
-  if (fread (source, 1, size, fp) != size)
-  {
-    fprintf (stderr, "%s: %s\n", filename, strerror (errno));
-
-    hcfree (source);
-
-    return NULL;
-  }
-
-  source[size] = 0;
-
-  fclose (fp);
-
-  return source;
-}
-
-#if defined (_WIN)
-#define DEVNULL "NUL"
-#else
-#define DEVNULL "/dev/null"
-#endif
-
-static int suppress_stderr (void)
-{
-  int null_fd = open (DEVNULL, O_WRONLY);
-
-  if (null_fd < 0) return -1;
-
-  int saved_fd = dup (fileno (stderr));
-
-  if (saved_fd < 0)
-  {
-    close (null_fd);
-
-    return -1;
-  }
-
-  dup2 (null_fd, fileno (stderr));
-
-  close (null_fd);
-
-  return saved_fd;
-}
-
-static void restore_stderr (int saved_fd)
-{
-  if (saved_fd < 0) return;
-
-  dup2 (saved_fd, fileno (stderr));
-
-  close (saved_fd);
-}
-
 static char *expand_pyenv_libpath (const char *prefix, const int maj, const int min)
 {
   char *out = NULL;
@@ -793,9 +721,14 @@ void *platform_init (user_options_t *user_options)
   python_interpreter->source_filename = (user_options->bridge_parameter1 == NULL) ? DEFAULT_SOURCE_FILENAME : user_options->bridge_parameter1;

-  char *source = load_source (python_interpreter->source_filename);
+  char *source = file_to_buffer (python_interpreter->source_filename);

-  if (source == NULL) return NULL;
+  if (source == NULL)
+  {
+    fprintf (stderr, "ERROR: %s: %s\n\n", python_interpreter->source_filename, strerror (errno));
+
+    return NULL;
+  }

   PyObject *code = python->Py_CompileStringExFlags (source, python_interpreter->source_filename, Py_file_input, NULL, -1);

diff --git a/src/bridges/bridge_python_generic_hash_sp.c b/src/bridges/bridge_python_generic_hash_sp.c
index a53d4391a..3d2dbb994 100644
--- a/src/bridges/bridge_python_generic_hash_sp.c
+++ b/src/bridges/bridge_python_generic_hash_sp.c
@@ -192,7 +192,7 @@ typedef struct
 } python_interpreter_t;

-static char *DEFAULT_SOURCE_FILENAME = "./Python/generic_hash_sp.py";
+static char *DEFAULT_SOURCE_FILENAME = "Python/generic_hash_sp.py";

 const char *extract_module_name (const char *path)
 {
@@ -221,78 +221,6 @@ const char *extract_module_name (const char *path)
   return module_name;
 }

-char *load_source (const char *filename)
-{
-  FILE *fp = fopen (filename, "r");
-
-  if (fp == NULL)
-  {
-    fprintf (stderr, "%s: %s\n", filename, strerror (errno));
-
-    return NULL;
-  }
-
-  fseek (fp, 0, SEEK_END);
-
-  const size_t size = ftell (fp);
-
-  fseek (fp, 0, SEEK_SET);
-
-  char *source = hcmalloc (size + 1);
-
-  if (fread (source, 1, size, fp) != size)
-  {
-    fprintf (stderr, "%s: %s\n", filename, strerror (errno));
-
-    hcfree (source);
-
-    return NULL;
-  }
-
-  source[size] = 0;
-
-  fclose (fp);
-
-  return source;
-}
-
-#if defined (_WIN)
-#define DEVNULL "NUL"
-#else
-#define DEVNULL "/dev/null"
-#endif
-
-static int suppress_stderr (void)
-{
-  int null_fd = open (DEVNULL, O_WRONLY);
-
-  if (null_fd < 0) return -1;
-
-  int saved_fd = dup (fileno (stderr));
-
-  if (saved_fd < 0)
-  {
-    close (null_fd);
-
-    return -1;
-  }
-
-  dup2 (null_fd, fileno (stderr));
-
-  close (null_fd);
-
-  return saved_fd;
-}
-
-static void restore_stderr (int saved_fd)
-{
-  if (saved_fd < 0) return;
-
-  dup2 (saved_fd, fileno (stderr));
-
-  close (saved_fd);
-}
-
 static char *expand_pyenv_libpath (const char *prefix, const int maj, const int min)
 {
   char *out = NULL;
@@ -843,7 +771,7 @@ bool thread_init (MAYBE_UNUSED void *platform_context, MAYBE_UNUSED hc_device_pa
   python->Py_DecRef (path);
   python->Py_DecRef (sys);

-  char *source = load_source (python_interpreter->source_filename);
+  char *source = file_to_buffer (python_interpreter->source_filename);

   if (source == NULL) return NULL;

@@ -1154,9 +1082,14 @@ const char *st_update_hash (MAYBE_UNUSED void *platform_context)
   python->Py_DecRef (path);
   python->Py_DecRef (sys);

-  char *source = load_source (python_interpreter->source_filename);
+  char *source = file_to_buffer (python_interpreter->source_filename);

-  if (source == NULL) return NULL;
+  if (source == NULL)
+  {
+    fprintf (stderr, "ERROR: %s: %s\n\n", python_interpreter->source_filename, strerror (errno));
+
+    return NULL;
+  }

   PyObject *code = python->Py_CompileStringExFlags (source, python_interpreter->source_filename, Py_file_input, NULL, -1);

@@ -1218,7 +1151,7 @@ const char *st_update_pass (MAYBE_UNUSED void *platform_context)

   // this is ugly to load that entire thing just to get that one variable

-  char *source = load_source (python_interpreter->source_filename);
+  char *source = file_to_buffer (python_interpreter->source_filename);

   if (source == NULL) return NULL;
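Side note on the two bridge files above: their private copies of `load_source`, `suppress_stderr`, and `restore_stderr` are dropped in favor of the shared `file_to_buffer` plus the new stderr helpers exported through include/shared.h (implementation added to src/shared.c near the end of this patch). For reference, a minimal standalone sketch of the dup()/dup2() redirection trick those helpers rely on; POSIX is assumed here, while the patch itself substitutes "NUL" for "/dev/null" on Windows:

```c
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main (void)
{
  // save the current stderr fd, then point fd 2 at the null device

  int null_fd  = open ("/dev/null", O_WRONLY);
  int saved_fd = dup (fileno (stderr));

  dup2 (null_fd, fileno (stderr));

  close (null_fd);

  fprintf (stderr, "suppressed\n");    // swallowed by /dev/null

  // restore the saved fd so later diagnostics are visible again

  dup2 (saved_fd, fileno (stderr));

  close (saved_fd);

  fprintf (stderr, "visible again\n"); // reaches the real stderr

  return 0;
}
```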
diff --git a/src/modules/module_25000.c b/src/modules/module_25000.c
index a3dd1872c..897d51791 100644
--- a/src/modules/module_25000.c
+++ b/src/modules/module_25000.c
@@ -23,8 +23,7 @@ static const char *HASH_NAME = "SNMPv3 HMAC-MD5-96/HMAC-SHA1-96";
 static const u64 KERN_TYPE = 25000;
 static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
-                           | OPTS_TYPE_PT_GENERATE_LE
-                           | OPTS_TYPE_MP_MULTI_DISABLE;
+                           | OPTS_TYPE_PT_GENERATE_LE;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS = "hashcat1";
 static const char *ST_HASH = "$SNMPv3$0$45889431$30818f0201033011020409242fc0020300ffe304010102010304383036041180001f88808106d566db57fd600000000002011002020118040a6d61747269785f4d4435040c0000000000000000000000000400303d041180001f88808106d566db57fd60000000000400a226020411f319300201000201003018301606082b06010201010200060a2b06010401bf0803020a$80001f88808106d566db57fd6000000000$1b37c3ea872731f922959e90";
@@ -118,6 +117,24 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }

+u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_accel_max = 256;
+
+  return kernel_accel_max;
+}
+
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_threads_max = 32;
+
+  return kernel_threads_max;
+}
+
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   // we need to fix iteration count to guarantee the loop count is a multiple of 64
@@ -337,11 +354,11 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size = MODULE_DEFAULT;
   module_ctx->module_jit_build_options = MODULE_DEFAULT;
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max = module_kernel_accel_max;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;

diff --git a/src/modules/module_25100.c b/src/modules/module_25100.c
index 24fd1ca74..57de16c79 100644
--- a/src/modules/module_25100.c
+++ b/src/modules/module_25100.c
@@ -23,8 +23,7 @@ static const char *HASH_NAME = "SNMPv3 HMAC-MD5-96";
 static const u64 KERN_TYPE = 25100;
 static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
-                           | OPTS_TYPE_PT_GENERATE_LE
-                           | OPTS_TYPE_MP_MULTI_DISABLE;
+                           | OPTS_TYPE_PT_GENERATE_LE;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS = "hashcat1";
 static const char *ST_HASH = "$SNMPv3$1$45889431$30818f0201033011020409242fc0020300ffe304010102010304383036041180001f88808106d566db57fd600000000002011002020118040a6d61747269785f4d4435040c0000000000000000000000000400303d041180001f88808106d566db57fd60000000000400a226020411f319300201000201003018301606082b06010201010200060a2b06010401bf0803020a$80001f88808106d566db57fd6000000000$1b37c3ea872731f922959e90";
@@ -114,6 +113,24 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }

+u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_accel_max = 256;
+
+  return kernel_accel_max;
+}
+
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_threads_max = 32;
+
+  return kernel_threads_max;
+}
+
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   // we need to fix iteration count to guarantee the loop count is a multiple of 64
@@ -318,11 +335,11 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size = MODULE_DEFAULT;
   module_ctx->module_jit_build_options = MODULE_DEFAULT;
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max = module_kernel_accel_max;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;

diff --git a/src/modules/module_25200.c b/src/modules/module_25200.c
index 22a671253..29674aa17 100644
--- a/src/modules/module_25200.c
+++ b/src/modules/module_25200.c
@@ -23,8 +23,7 @@ static const char *HASH_NAME = "SNMPv3 HMAC-SHA1-96";
 static const u64 KERN_TYPE = 25200;
 static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
-                           | OPTS_TYPE_PT_GENERATE_LE
-                           | OPTS_TYPE_MP_MULTI_DISABLE;
+                           | OPTS_TYPE_PT_GENERATE_LE;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS = "hashcat1";
 static const char *ST_HASH = "$SNMPv3$2$45889431$30818f02010330110204371780f3020300ffe304010102010304383036041180001f88808106d566db57fd600000000002011002020118040a6d61747269785f534841040c0000000000000000000000000400303d041180001f88808106d566db57fd60000000000400a2260204073557d50201000201003018301606082b06010201010200060a2b06010401bf0803020a$80001f88808106d566db57fd6000000000$81f14f1930589f26f6755f6b";
@@ -114,6 +113,24 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }

+u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_accel_max = 256;
+
+  return kernel_accel_max;
+}
+
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_threads_max = 32;
+
+  return kernel_threads_max;
+}
+
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   // we need to fix iteration count to guarantee the loop count is a multiple of 64
@@ -329,11 +346,11 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size = MODULE_DEFAULT;
   module_ctx->module_jit_build_options = MODULE_DEFAULT;
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max = module_kernel_accel_max;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
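The same `module_kernel_accel_max`/`module_kernel_threads_max` pair is stamped into every SNMPv3 module in this patch (the remaining four follow below). A rough standalone sketch of the sizing argument behind the caps; the device and per-work-item tmps figures here are made-up placeholders, not values taken from the patch:

```c
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main (void)
{
  // hypothetical device and per-work-item tmps size (placeholders)

  const uint64_t device_processors = 64;
  const uint64_t tmp_size          = 64 * 1024; // large SNMPv3 packet buffers

  // caps taken from the patch

  const uint64_t kernel_accel_max   = 256;
  const uint64_t kernel_threads_max = 32;

  // worst-case tmps buffer: one slot per in-flight work-item; this product
  // is what must stay within device memory, hence the lowered limits

  const uint64_t bound = device_processors * kernel_accel_max * kernel_threads_max * tmp_size;

  printf ("tmps upper bound: %" PRIu64 " MiB\n", bound >> 20);

  return 0;
}
```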
diff --git a/src/modules/module_26700.c b/src/modules/module_26700.c
index e802726f6..82132f35e 100644
--- a/src/modules/module_26700.c
+++ b/src/modules/module_26700.c
@@ -115,6 +115,24 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }

+u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_accel_max = 256;
+
+  return kernel_accel_max;
+}
+
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_threads_max = 32;
+
+  return kernel_threads_max;
+}
+
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   // we need to fix iteration count to guarantee the loop count is a multiple of 64
@@ -332,11 +350,11 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size = MODULE_DEFAULT;
   module_ctx->module_jit_build_options = MODULE_DEFAULT;
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max = module_kernel_accel_max;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;

diff --git a/src/modules/module_26800.c b/src/modules/module_26800.c
index 7363bedff..0c15fe0a0 100644
--- a/src/modules/module_26800.c
+++ b/src/modules/module_26800.c
@@ -115,6 +115,24 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }

+u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_accel_max = 256;
+
+  return kernel_accel_max;
+}
+
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_threads_max = 32;
+
+  return kernel_threads_max;
+}
+
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   // we need to fix iteration count to guarantee the loop count is a multiple of 64
@@ -340,11 +358,11 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size = MODULE_DEFAULT;
   module_ctx->module_jit_build_options = MODULE_DEFAULT;
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max = module_kernel_accel_max;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;

diff --git a/src/modules/module_26900.c b/src/modules/module_26900.c
index 5282933a6..975a97c83 100644
--- a/src/modules/module_26900.c
+++ b/src/modules/module_26900.c
@@ -149,6 +149,24 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }

+u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_accel_max = 256;
+
+  return kernel_accel_max;
+}
+
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_threads_max = 32;
+
+  return kernel_threads_max;
+}
+
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   // we need to fix iteration count to guarantee the loop count is a multiple of SNMPV3_MAX_PW_LENGTH
@@ -383,11 +401,11 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size = MODULE_DEFAULT;
   module_ctx->module_jit_build_options = module_jit_build_options;
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max = module_kernel_accel_max;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;

diff --git a/src/modules/module_27300.c b/src/modules/module_27300.c
index 535f7bc36..2514d95a1 100644
--- a/src/modules/module_27300.c
+++ b/src/modules/module_27300.c
@@ -149,6 +149,24 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }

+u32 module_kernel_accel_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_accel_max = 256;
+
+  return kernel_accel_max;
+}
+
+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // due to the large tmps structure
+
+  const u32 kernel_threads_max = 32;
+
+  return kernel_threads_max;
+}
+
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   // we need to fix iteration count to guarantee the loop count is a multiple of SNMPV3_MAX_PW_LENGTH
@@ -379,11 +397,11 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook_size = MODULE_DEFAULT;
   module_ctx->module_jit_build_options = module_jit_build_options;
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
-  module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max = module_kernel_accel_max;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
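Each of these modules also pins `module_kernel_loops_min`/`module_kernel_loops_max` (visible as unchanged context above) so the per-invocation loop count stays a multiple of the unit the kernel consumes per iteration: 64 for the MD5/SHA1 variants, SNMPV3_MAX_PW_LENGTH for the SHA-2 ones. A hypothetical helper illustrating that rounding; `round_down_multiple` is not a function from the patch, only a sketch of the constraint:

```c
#include <stdio.h>
#include <stdint.h>

// hypothetical helper: clamp an iteration count down to a multiple of `unit`
// so a loop kernel never stops mid-block

static uint32_t round_down_multiple (uint32_t count, uint32_t unit)
{
  return count - (count % unit);
}

int main (void)
{
  printf ("%u\n", round_down_multiple (1000000, 64)); // prints 999936

  return 0;
}
```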
diff --git a/src/modules/scrypt_common.c b/src/modules/scrypt_common.c
index bbaa3b556..081d63f01 100644
--- a/src/modules/scrypt_common.c
+++ b/src/modules/scrypt_common.c
@@ -54,10 +54,10 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N == 0) ? hashes->st_salts_buf[0].scrypt_N : hashes->salts_buf[0].scrypt_N;
   const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r == 0) ? hashes->st_salts_buf[0].scrypt_r : hashes->salts_buf[0].scrypt_r;
-  const u32 scrypt_p = (hashes->salts_buf[0].scrypt_p == 0) ? hashes->st_salts_buf[0].scrypt_p : hashes->salts_buf[0].scrypt_p;
+  //const u32 scrypt_p = (hashes->salts_buf[0].scrypt_p == 0) ? hashes->st_salts_buf[0].scrypt_p : hashes->salts_buf[0].scrypt_p;

   const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
-  const u64 state_per_accel = (128ULL * scrypt_r * scrypt_p * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
+  //const u64 state_per_accel = (128ULL * scrypt_r * scrypt_p * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));

   int lines_sz = 4096;

   char *lines_buf = hcmalloc (lines_sz);
@@ -65,7 +65,7 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   const u32 device_processors = device_param->device_processors;

-  const u32 device_local_mem_size = device_param->device_local_mem_size;
+  //const u32 device_local_mem_size = device_param->device_local_mem_size;

   const u64 fixed_mem = (512 * 1024 * 1024); // some storage we need for pws[], tmps[], and others
@@ -160,21 +160,6 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
         break;
       }
-
-      if (device_param->is_hip == true)
-      {
-        // we use some local memory to speed up things, so
-        // we need to make sure there's enough local memory available
-
-        u64 state_per_accel_tmto = state_per_accel >> tmto;
-
-        while (state_per_accel_tmto > device_local_mem_size)
-        {
-          tmto++;
-
-          state_per_accel_tmto = state_per_accel >> tmto;
-        }
-      }
     }
   }
@@ -301,7 +286,7 @@ char *scrypt_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconf
     scrypt_r,
     scrypt_p,
     tmto,
-    tmp_size / 16);
+    tmp_size / 4);

   return jit_build_options;
 }

diff --git a/src/shared.c b/src/shared.c
index 4661c6934..3a4a1cfe2 100644
--- a/src/shared.c
+++ b/src/shared.c
@@ -1582,3 +1582,40 @@ bool remove_file_suffix (char *file, const char *suffix)
   return true;
 }
+
+#if defined (_WIN)
+#define DEVNULL "NUL"
+#else
+#define DEVNULL "/dev/null"
+#endif
+
+int suppress_stderr (void)
+{
+  int null_fd = open (DEVNULL, O_WRONLY);
+
+  if (null_fd < 0) return -1;
+
+  int saved_fd = dup (fileno (stderr));
+
+  if (saved_fd < 0)
+  {
+    close (null_fd);
+
+    return -1;
+  }
+
+  dup2 (null_fd, fileno (stderr));
+
+  close (null_fd);
+
+  return saved_fd;
+}
+
+void restore_stderr (int saved_fd)
+{
+  if (saved_fd < 0) return;
+
+  dup2 (saved_fd, fileno (stderr));
+
+  close (saved_fd);
+}
+

diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl
index a7a09ae9f..09e38928a 100755
--- a/tools/benchmark_deep.pl
+++ b/tools/benchmark_deep.pl
@@ -34,7 +34,7 @@ if ($cpu_benchmark == 1)
 }
 else
 {
-  system ("rocm-smi --resetprofile --resetclocks --resetfans");
+  #system ("rocm-smi --resetprofile --resetclocks --resetfans");

   system ("rocm-smi --setfan 100% --setperflevel high");
   system ("nvidia-settings -a GPUPowerMizerMode=1 -a GPUFanControlState=1 -a GPUTargetFanSpeed=100");
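For context on the scrypt_common.c hunk above: `size_per_accel` (the line the patch keeps) is the classic scrypt footprint of 128 * r * N bytes per lane, and each TMTO step halves the stored state at the cost of recomputation, which is what the removed HIP-only branch used to iterate on against local memory. A standalone sketch of that arithmetic; N, r, and the thread count below are illustrative example values, not figures from the patch:

```c
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main (void)
{
  // example scrypt parameters (illustrative only)

  const uint64_t scrypt_N = 16384;
  const uint64_t scrypt_r = 8;
  const uint64_t threads  = 32; // stand-in for scrypt_exptected_threads ()

  // same shape as the formula kept in scrypt_module_extra_tuningdb_block ()

  const uint64_t size_per_accel = 128ULL * scrypt_r * scrypt_N * threads;

  // raising the TMTO shifts the buffer down by one power of two per step

  for (uint32_t tmto = 0; tmto < 4; tmto++)
  {
    printf ("tmto=%" PRIu32 " -> %" PRIu64 " MiB per accel\n", tmto, (size_per_accel >> tmto) >> 20);
  }

  return 0;
}
```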