diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl index 3064263a2..b74b3d9c6 100644 --- a/OpenCL/m08900-pure.cl +++ b/OpenCL/m08900-pure.cl @@ -128,6 +128,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) { + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4 / 2]; + + for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TT[dst_off + 0] = TI[src_off + 0]; + TT[dst_off + 1] = TI[src_off + 1]; + TT[dst_off + 2] = TI[src_off + 2]; + TT[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TI[dst_off + 0] = TI[src_off + 0]; + TI[dst_off + 1] = TI[src_off + 1]; + TI[dst_off + 2] = TI[src_off + 2]; + TI[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) + { + TI[dst_off + 0] = TT[src_off + 0]; + TI[dst_off + 1] = TT[src_off + 1]; + TI[dst_off + 2] = TT[src_off + 2]; + TI[dst_off + 3] = TT[src_off + 3]; + } + + #endif + uint4 R0 = TI[STATE_CNT4 - 4]; uint4 R1 = TI[STATE_CNT4 - 3]; uint4 R2 = TI[STATE_CNT4 - 2]; @@ -165,36 +195,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) TI[i + 2] = R2; TI[i + 3] = R3; } - - #if SCRYPT_R > 1 - - uint4 TT[STATE_CNT4 / 2]; - - for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TT[dst_off + 0] = TI[src_off + 0]; - TT[dst_off + 1] = TI[src_off + 1]; - TT[dst_off + 2] = TI[src_off + 2]; - TT[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TI[dst_off + 0] = TI[src_off + 0]; - TI[dst_off + 1] = TI[src_off + 1]; - TI[dst_off + 2] = TI[src_off + 2]; - TI[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) - { - TI[dst_off + 0] = TT[src_off + 0]; - TI[dst_off + 1] = TT[src_off + 1]; - TI[dst_off + 2] = 
TT[src_off + 2]; - TI[dst_off + 3] = TT[src_off + 3]; - } - - #endif } DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid) @@ -217,6 +217,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL case 3: V = V3; break; } + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4]; + + for (int z = 0; z < zSIZE; z++) TT[z] = X[z]; + + for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + #endif + for (u32 y = 0; y < ySIZE; y++) { for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; @@ -459,10 +483,14 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len); - for (u32 l = 0; l < SCRYPT_CNT4; l += 4) + for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4) { + for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8) + { uint4 X[4]; + const u32 l = i + j + ((j >= STATE_CNT4) ? 
(4 - STATE_CNT4) : 0); + X[0] = tmps[gid].P[l + 0]; X[1] = tmps[gid].P[l + 1]; X[2] = tmps[gid].P[l + 2]; @@ -510,6 +538,7 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) w3[3] = T[3].w; sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } } w0[0] = 1; diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl index e500b4f70..d435883ce 100644 --- a/OpenCL/m15700-pure.cl +++ b/OpenCL/m15700-pure.cl @@ -135,6 +135,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) { + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4 / 2]; + + for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TT[dst_off + 0] = TI[src_off + 0]; + TT[dst_off + 1] = TI[src_off + 1]; + TT[dst_off + 2] = TI[src_off + 2]; + TT[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TI[dst_off + 0] = TI[src_off + 0]; + TI[dst_off + 1] = TI[src_off + 1]; + TI[dst_off + 2] = TI[src_off + 2]; + TI[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) + { + TI[dst_off + 0] = TT[src_off + 0]; + TI[dst_off + 1] = TT[src_off + 1]; + TI[dst_off + 2] = TT[src_off + 2]; + TI[dst_off + 3] = TT[src_off + 3]; + } + + #endif + uint4 R0 = TI[STATE_CNT4 - 4]; uint4 R1 = TI[STATE_CNT4 - 3]; uint4 R2 = TI[STATE_CNT4 - 2]; @@ -172,36 +202,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) TI[i + 2] = R2; TI[i + 3] = R3; } - - #if SCRYPT_R > 1 - - uint4 TT[STATE_CNT4 / 2]; - - for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TT[dst_off + 0] = TI[src_off + 0]; - TT[dst_off + 1] = TI[src_off + 1]; - TT[dst_off + 2] = TI[src_off + 2]; - TT[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TI[dst_off + 0] = TI[src_off + 0]; - TI[dst_off + 1] = TI[src_off + 1]; - 
TI[dst_off + 2] = TI[src_off + 2]; - TI[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) - { - TI[dst_off + 0] = TT[src_off + 0]; - TI[dst_off + 1] = TT[src_off + 1]; - TI[dst_off + 2] = TT[src_off + 2]; - TI[dst_off + 3] = TT[src_off + 3]; - } - - #endif } DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid) @@ -224,6 +224,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL case 3: V = V3; break; } + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4]; + + for (int z = 0; z < zSIZE; z++) TT[z] = X[z]; + + for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + #endif + for (u32 y = 0; y < ySIZE; y++) { for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; @@ -595,10 +619,15 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_ sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len); - for (u32 l = 0; l < SCRYPT_CNT4; l += 4) + + for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4) { + for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8) + { uint4 X[4]; + const u32 l = i + j + ((j >= STATE_CNT4) ? 
(4 - STATE_CNT4) : 0); + X[0] = tmps[gid].P[l + 0]; X[1] = tmps[gid].P[l + 1]; X[2] = tmps[gid].P[l + 2]; @@ -646,6 +675,7 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_ w3[3] = T[3].w; sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } } w0[0] = 1; diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl index a29df1c03..303e5e334 100644 --- a/OpenCL/m22700-pure.cl +++ b/OpenCL/m22700-pure.cl @@ -176,6 +176,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) { + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4 / 2]; + + for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TT[dst_off + 0] = TI[src_off + 0]; + TT[dst_off + 1] = TI[src_off + 1]; + TT[dst_off + 2] = TI[src_off + 2]; + TT[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TI[dst_off + 0] = TI[src_off + 0]; + TI[dst_off + 1] = TI[src_off + 1]; + TI[dst_off + 2] = TI[src_off + 2]; + TI[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) + { + TI[dst_off + 0] = TT[src_off + 0]; + TI[dst_off + 1] = TT[src_off + 1]; + TI[dst_off + 2] = TT[src_off + 2]; + TI[dst_off + 3] = TT[src_off + 3]; + } + + #endif + uint4 R0 = TI[STATE_CNT4 - 4]; uint4 R1 = TI[STATE_CNT4 - 3]; uint4 R2 = TI[STATE_CNT4 - 2]; @@ -213,36 +243,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) TI[i + 2] = R2; TI[i + 3] = R3; } - - #if SCRYPT_R > 1 - - uint4 TT[STATE_CNT4 / 2]; - - for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TT[dst_off + 0] = TI[src_off + 0]; - TT[dst_off + 1] = TI[src_off + 1]; - TT[dst_off + 2] = TI[src_off + 2]; - TT[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TI[dst_off + 0] = TI[src_off + 0]; - TI[dst_off + 1] = 
TI[src_off + 1]; - TI[dst_off + 2] = TI[src_off + 2]; - TI[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) - { - TI[dst_off + 0] = TT[src_off + 0]; - TI[dst_off + 1] = TT[src_off + 1]; - TI[dst_off + 2] = TT[src_off + 2]; - TI[dst_off + 3] = TT[src_off + 3]; - } - - #endif } DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid) @@ -265,6 +265,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL case 3: V = V3; break; } + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4]; + + for (int z = 0; z < zSIZE; z++) TT[z] = X[z]; + + for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + #endif + for (u32 y = 0; y < ySIZE; y++) { for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; @@ -597,10 +621,14 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) u32 w2[4]; u32 w3[4]; - for (u32 l = 0; l < SCRYPT_CNT4; l += 4) + for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4) { + for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8) + { uint4 X[4]; + const u32 l = i + j + ((j >= STATE_CNT4) ? 
(4 - STATE_CNT4) : 0); + X[0] = tmps[gid].P[l + 0]; X[1] = tmps[gid].P[l + 1]; X[2] = tmps[gid].P[l + 2]; @@ -648,6 +676,7 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) w3[3] = T[3].w; sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } } w0[0] = 1; diff --git a/OpenCL/m27700-pure.cl b/OpenCL/m27700-pure.cl index c62dc90d6..d9bf11510 100644 --- a/OpenCL/m27700-pure.cl +++ b/OpenCL/m27700-pure.cl @@ -126,6 +126,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) { + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4 / 2]; + + for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TT[dst_off + 0] = TI[src_off + 0]; + TT[dst_off + 1] = TI[src_off + 1]; + TT[dst_off + 2] = TI[src_off + 2]; + TT[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TI[dst_off + 0] = TI[src_off + 0]; + TI[dst_off + 1] = TI[src_off + 1]; + TI[dst_off + 2] = TI[src_off + 2]; + TI[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) + { + TI[dst_off + 0] = TT[src_off + 0]; + TI[dst_off + 1] = TT[src_off + 1]; + TI[dst_off + 2] = TT[src_off + 2]; + TI[dst_off + 3] = TT[src_off + 3]; + } + + #endif + uint4 R0 = TI[STATE_CNT4 - 4]; uint4 R1 = TI[STATE_CNT4 - 3]; uint4 R2 = TI[STATE_CNT4 - 2]; @@ -163,36 +193,6 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) TI[i + 2] = R2; TI[i + 3] = R3; } - - #if SCRYPT_R > 1 - - uint4 TT[STATE_CNT4 / 2]; - - for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TT[dst_off + 0] = TI[src_off + 0]; - TT[dst_off + 1] = TI[src_off + 1]; - TT[dst_off + 2] = TI[src_off + 2]; - TT[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TI[dst_off + 0] = TI[src_off + 0]; - TI[dst_off + 1] = TI[src_off + 1]; - 
TI[dst_off + 2] = TI[src_off + 2]; - TI[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) - { - TI[dst_off + 0] = TT[src_off + 0]; - TI[dst_off + 1] = TT[src_off + 1]; - TI[dst_off + 2] = TT[src_off + 2]; - TI[dst_off + 3] = TT[src_off + 3]; - } - - #endif } DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid) @@ -215,6 +215,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL case 3: V = V3; break; } + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4]; + + for (int z = 0; z < zSIZE; z++) TT[z] = X[z]; + + for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + #endif + for (u32 y = 0; y < ySIZE; y++) { for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; @@ -549,10 +573,14 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) u32 w2[4]; u32 w3[4]; - for (u32 l = 0; l < SCRYPT_CNT4; l += 4) + for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4) { + for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8) + { uint4 X[4]; + const u32 l = i + j + ((j >= STATE_CNT4) ? 
(4 - STATE_CNT4) : 0); + X[0] = tmps[gid].P[l + 0]; X[1] = tmps[gid].P[l + 1]; X[2] = tmps[gid].P[l + 2]; @@ -600,6 +628,7 @@ KERNEL_FQ void m27700_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) w3[3] = T[3].w; sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } } w0[0] = 1; diff --git a/OpenCL/m28200-pure.cl b/OpenCL/m28200-pure.cl index 2260e931b..58106a007 100644 --- a/OpenCL/m28200-pure.cl +++ b/OpenCL/m28200-pure.cl @@ -138,6 +138,36 @@ DECLSPEC uint4 hc_swap32_4 (uint4 v) DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) { + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4 / 2]; + + for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TT[dst_off + 0] = TI[src_off + 0]; + TT[dst_off + 1] = TI[src_off + 1]; + TT[dst_off + 2] = TI[src_off + 2]; + TT[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) + { + TI[dst_off + 0] = TI[src_off + 0]; + TI[dst_off + 1] = TI[src_off + 1]; + TI[dst_off + 2] = TI[src_off + 2]; + TI[dst_off + 3] = TI[src_off + 3]; + } + + for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) + { + TI[dst_off + 0] = TT[src_off + 0]; + TI[dst_off + 1] = TT[src_off + 1]; + TI[dst_off + 2] = TT[src_off + 2]; + TI[dst_off + 3] = TT[src_off + 3]; + } + + #endif + uint4 R0 = TI[STATE_CNT4 - 4]; uint4 R1 = TI[STATE_CNT4 - 3]; uint4 R2 = TI[STATE_CNT4 - 2]; @@ -175,38 +205,9 @@ DECLSPEC void salsa_r (PRIVATE_AS uint4 *TI) TI[i + 2] = R2; TI[i + 3] = R3; } - - #if SCRYPT_R > 1 - - uint4 TT[STATE_CNT4 / 2]; - - for (int dst_off = 0, src_off = 4; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TT[dst_off + 0] = TI[src_off + 0]; - TT[dst_off + 1] = TI[src_off + 1]; - TT[dst_off + 2] = TI[src_off + 2]; - TT[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = 4, src_off = 8; src_off < STATE_CNT4; dst_off += 4, src_off += 8) - { - TI[dst_off + 0] = TI[src_off + 0]; - TI[dst_off + 1] = TI[src_off + 1]; - 
TI[dst_off + 2] = TI[src_off + 2]; - TI[dst_off + 3] = TI[src_off + 3]; - } - - for (int dst_off = STATE_CNT4 / 2, src_off = 0; dst_off < STATE_CNT4; dst_off += 4, src_off += 4) - { - TI[dst_off + 0] = TT[src_off + 0]; - TI[dst_off + 1] = TT[src_off + 1]; - TI[dst_off + 2] = TT[src_off + 2]; - TI[dst_off + 3] = TT[src_off + 3]; - } - - #endif } + DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3, const u64 gid) { const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO; @@ -227,6 +228,30 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL case 3: V = V3; break; } + #if SCRYPT_R > 1 + + uint4 TT[STATE_CNT4]; + + for (int z = 0; z < zSIZE; z++) TT[z] = X[z]; + + for (int dst_off = 8, src_off = 4; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + for (int dst_off = 4, src_off = zSIZE / 2; dst_off < zSIZE; dst_off += 8, src_off += 4) + { + X[dst_off + 0] = TT[src_off + 0]; + X[dst_off + 1] = TT[src_off + 1]; + X[dst_off + 2] = TT[src_off + 2]; + X[dst_off + 3] = TT[src_off + 3]; + } + + #endif + for (u32 y = 0; y < ySIZE; y++) { for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z]; @@ -517,10 +542,14 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t)) sha256_hmac_init_global_swap (&ctx, pws[gid].i, pws[gid].pw_len); - for (u32 l = 0; l < SCRYPT_CNT4; l += 4) + for (u32 i = 0; i < SCRYPT_CNT4; i += STATE_CNT4) { + for (u32 j = 0; j < (STATE_CNT4 * 2); j += 8) + { uint4 X[4]; + const u32 l = i + j + ((j >= STATE_CNT4) ? 
(4 - STATE_CNT4) : 0); + X[0] = tmps[gid].P[l + 0]; X[1] = tmps[gid].P[l + 1]; X[2] = tmps[gid].P[l + 2]; @@ -568,6 +597,7 @@ KERNEL_FQ void m28200_comp (KERN_ATTR_TMPS_ESALT (exodus_tmp_t, exodus_t)) w3[3] = T[3].w; sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64); + } } w0[0] = 1; diff --git a/include/common.h b/include/common.h index 8fdd49bcb..a744d532a 100644 --- a/include/common.h +++ b/include/common.h @@ -131,7 +131,7 @@ but this is needed for VS compiler which doesn't have inline keyword but has __i #define CPT_CACHE 0x20000 #define PARAMCNT 64 -#define DEVICES_MAX 128 +#define DEVICES_MAX 256 #define EXEC_CACHE 128 #define SPEED_CACHE 4096 #define SPEED_MAXAGE 4096 diff --git a/include/types.h b/include/types.h index df1e97118..e6ea946f9 100644 --- a/include/types.h +++ b/include/types.h @@ -1901,7 +1901,7 @@ typedef struct backend_ctx int opencl_devices_cnt; int opencl_devices_active; - u64 backend_devices_filter; + bool backend_devices_filter[DEVICES_MAX + 1]; hc_device_param_t *devices_param; diff --git a/src/backend.c b/src/backend.c index 6137d2767..f7c916e1d 100644 --- a/src/backend.c +++ b/src/backend.c @@ -157,13 +157,14 @@ static int backend_ctx_find_alias_devices (hashcat_ctx_t *hashcat_ctx) // show a warning for specifically listed devices if they are an alias - if (backend_ctx->backend_devices_filter != (u64) -1) + // slot [DEVICES_MAX] is set only when no explicit device list was given, so warn only if the user really listed devices + if (!backend_ctx->backend_devices_filter[DEVICES_MAX]) { - if (backend_ctx->backend_devices_filter & (1ULL << alias_device->device_id)) + if (backend_ctx->backend_devices_filter[alias_device->device_id]) { event_log_warning (hashcat_ctx, "The device #%d specifically listed was skipped because it is an alias of device #%d", alias_device->device_id + 1, backend_device->device_id + 1); event_log_warning (hashcat_ctx, NULL); } } } } @@ -273,9 +274,9 @@ static int
ocl_check_dri (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx) return 0; } -static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char *backend_devices, u64 *out) +static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char *backend_devices, bool *out) { - u64 backend_devices_filter = 0; + bool backend_devices_filter[DEVICES_MAX + 1] = {false}; if (backend_devices) { @@ -291,7 +292,7 @@ static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char { const int backend_device_id = (const int) strtol (next, NULL, 10); - if ((backend_device_id <= 0) || (backend_device_id >= 64)) + if ((backend_device_id <= 0) || (backend_device_id >= DEVICES_MAX)) { event_log_error (hashcat_ctx, "Invalid device_id %d specified.", backend_device_id); @@ -300,7 +301,7 @@ static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char return false; } - backend_devices_filter |= 1ULL << (backend_device_id - 1); + backend_devices_filter[backend_device_id - 1] = true; } while ((next = strtok_r ((char *) NULL, ",", &saveptr)) != NULL); @@ -308,10 +309,16 @@ static bool setup_backend_devices_filter (hashcat_ctx_t *hashcat_ctx, const char } else { - backend_devices_filter = -1ULL; + for (int i = 0; i <= DEVICES_MAX; i++) + { + backend_devices_filter[i] = true; + } } - *out = backend_devices_filter; + for (int i = 0; i <= DEVICES_MAX; i++) + { + out[i] = backend_devices_filter[i]; + } return true; } @@ -4613,11 +4620,11 @@ int backend_ctx_init (hashcat_ctx_t *hashcat_ctx) * Backend device selection */ - u64 backend_devices_filter; + bool backend_devices_filter[DEVICES_MAX + 1]; - if (setup_backend_devices_filter (hashcat_ctx, user_options->backend_devices, &backend_devices_filter) == false) return -1; + if (setup_backend_devices_filter (hashcat_ctx, user_options->backend_devices, backend_devices_filter) == false) return -1; - backend_ctx->backend_devices_filter = backend_devices_filter; + for (int i = 0; i <= DEVICES_MAX; 
i++) backend_ctx->backend_devices_filter[i] = backend_devices_filter[i]; /** * OpenCL device type selection @@ -5276,7 +5283,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // skipped - if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) + if (!backend_ctx->backend_devices_filter[device_id]) { device_param->skipped = true; } @@ -5693,7 +5700,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // skipped - if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) + if (!backend_ctx->backend_devices_filter[device_id]) { device_param->skipped = true; } @@ -6190,7 +6197,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // skipped - if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) + if (!backend_ctx->backend_devices_filter[device_id]) { device_param->skipped = true; } @@ -6989,7 +6996,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // skipped - if ((backend_ctx->backend_devices_filter & (1ULL << device_id)) == 0) + if (!backend_ctx->backend_devices_filter[device_id]) { device_param->skipped = true; } @@ -7592,7 +7599,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == false) { - if (backend_ctx->backend_devices_filter == -1ULL) + if (backend_ctx->backend_devices_filter[DEVICES_MAX]) { if ((user_options->quiet == false) && (user_options->backend_info == 0)) { @@ -7605,7 +7612,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) } else { - if (backend_ctx->backend_devices_filter & (1ULL << device_param->device_id)) + if (backend_ctx->backend_devices_filter[device_param->device_id]) { // ok } @@ -7661,7 +7668,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. 
larger than devices_cnt) - if (backend_ctx->backend_devices_cnt >= 64) + if (backend_ctx->backend_devices_cnt >= DEVICES_MAX) { event_log_error (hashcat_ctx, "Illegal use of the --backend-devices parameter because too many backend devices were found (%u).", backend_ctx->backend_devices_cnt); event_log_error (hashcat_ctx, "If possible, disable one of your backends to reduce the number of backend devices. For example \"--backend-ignore-cuda\" or \"--backend-ignore-opencl\" ."); @@ -7669,16 +7676,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) return -1; } - if (backend_ctx->backend_devices_filter != (u64) -1) + if (!backend_ctx->backend_devices_filter[DEVICES_MAX]) { - const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt); - if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask) + // a set slot at or beyond the detected device count means the user listed a device that does not exist + for (int i = backend_ctx->backend_devices_cnt; i < DEVICES_MAX; i++) { - event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter."); - event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt); + if (backend_ctx->backend_devices_filter[i]) + { + event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter."); + event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt); - return -1; + return -1; + } } }