diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl
index 9744618e0..b2f9a6f65 100644
--- a/OpenCL/m08900-pure.cl
+++ b/OpenCL/m08900-pure.cl
@@ -170,22 +170,18 @@ DECLSPEC void salsa_r (uint4 *TI)
     TT[idx_r2++] = R3;
   }

-  idx_r1 = 0;
-  idx_r2 = SCRYPT_R * 4;
+  idx_r2 = 0;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (int i = 0; i < SCRYPT_R; i++)
   {
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
   }
 }

-DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_init (uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -208,55 +204,15 @@ DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_
     case 3: V = V3; break;
   }

-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
-
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];

     for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
   }
-
-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
 }

-DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_loop (uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -279,26 +235,6 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_
     case 3: V = V3; break;
   }

-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
-
   for (u32 N_pos = 0; N_pos < 1024; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
@@ -307,6 +243,8 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_

     const u32 km = k - (y * SCRYPT_TMTO);

+    uint4 T[STATE_CNT4];
+
     for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];

     for (u32 i = 0; i < km; i++) salsa_r (T);
@@ -315,26 +253,6 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_

     salsa_r (X);
   }
-
-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
 }

 KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -405,6 +323,40 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t))
     tmps[gid].P[k + 0] = tmp0;
     tmps[gid].P[k + 1] = tmp1;
   }
+
+  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+  {
+    uint4 T[4];
+
+    T[0] = tmps[gid].P[l + 0];
+    T[1] = tmps[gid].P[l + 1];
+    T[2] = tmps[gid].P[l + 2];
+    T[3] = tmps[gid].P[l + 3];
+
+    T[0] = hc_swap32_4 (T[0]);
+    T[1] = hc_swap32_4 (T[1]);
+    T[2] = hc_swap32_4 (T[2]);
+    T[3] = hc_swap32_4 (T[3]);
+
+    uint4 X[4];
+
+    #ifdef IS_CUDA
+    X[0] = make_uint4 (T[0].x, T[1].y, T[2].z, T[3].w);
+    X[1] = make_uint4 (T[1].x, T[2].y, T[3].z, T[0].w);
+    X[2] = make_uint4 (T[2].x, T[3].y, T[0].z, T[1].w);
+    X[3] = make_uint4 (T[3].x, T[0].y, T[1].z, T[2].w);
+    #else
+    X[0] = (uint4) (T[0].x, T[1].y, T[2].z, T[3].w);
+    X[1] = (uint4) (T[1].x, T[2].y, T[3].z, T[0].w);
+    X[2] = (uint4) (T[2].x, T[3].y, T[0].z, T[1].w);
+    X[3] = (uint4) (T[3].x, T[0].y, T[1].z, T[2].w);
+    #endif
+
+    tmps[gid].P[l + 0] = X[0];
+    tmps[gid].P[l + 1] = X[1];
+    tmps[gid].P[l + 2] = X[2];
+    tmps[gid].P[l + 3] = X[3];
+  }
 }

 KERNEL_FQ void m08900_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -414,6 +366,7 @@
   */

   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);

   if (gid >= gid_max) return;

@@ -425,26 +378,20 @@ KERNEL_FQ void m08900_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
   GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;

   uint4 X[STATE_CNT4];
-  uint4 T[STATE_CNT4];

   const u32 P_offset = salt_repeat * STATE_CNT4;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = tmps[gid].P[P_offset + z];

-  scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = X[z];
 }

 KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
 {
   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);

   if (gid >= gid_max) return;

@@ -454,21 +401,14 @@ KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
   GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;

   uint4 X[STATE_CNT4];
-  uint4 T[STATE_CNT4];

   const u32 P_offset = salt_repeat * STATE_CNT4;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = tmps[gid].P[P_offset + z];

-  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  scrypt_smix_loop (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = X[z];
 }

 KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -497,35 +437,48 @@ KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t))

   for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
   {
-    uint4 tmp;
-
-    tmp = tmps[gid].P[l + 0];
+    uint4 X[4];

-    w0[0] = tmp.x;
-    w0[1] = tmp.y;
-    w0[2] = tmp.z;
-    w0[3] = tmp.w;
+    X[0] = tmps[gid].P[l + 0];
+    X[1] = tmps[gid].P[l + 1];
+    X[2] = tmps[gid].P[l + 2];
+    X[3] = tmps[gid].P[l + 3];

-    tmp = tmps[gid].P[l + 1];
+    uint4 T[4];

-    w1[0] = tmp.x;
-    w1[1] = tmp.y;
-    w1[2] = tmp.z;
-    w1[3] = tmp.w;
-
-    tmp = tmps[gid].P[l + 2];
-
-    w2[0] = tmp.x;
-    w2[1] = tmp.y;
-    w2[2] = tmp.z;
-    w2[3] = tmp.w;
-
-    tmp = tmps[gid].P[l + 3];
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[0].x, X[3].y, X[2].z, X[1].w);
+    T[1] = make_uint4 (X[1].x, X[0].y, X[3].z, X[2].w);
+    T[2] = make_uint4 (X[2].x, X[1].y, X[0].z, X[3].w);
+    T[3] = make_uint4 (X[3].x, X[2].y, X[1].z, X[0].w);
+    #else
+    T[0] = (uint4) (X[0].x, X[3].y, X[2].z, X[1].w);
+    T[1] = (uint4) (X[1].x, X[0].y, X[3].z, X[2].w);
+    T[2] = (uint4) (X[2].x, X[1].y, X[0].z, X[3].w);
+    T[3] = (uint4) (X[3].x, X[2].y, X[1].z, X[0].w);
+    #endif

-    w3[0] = tmp.x;
-    w3[1] = tmp.y;
-    w3[2] = tmp.z;
-    w3[3] = tmp.w;
+    T[0] = hc_swap32_4 (T[0]);
+    T[1] = hc_swap32_4 (T[1]);
+    T[2] = hc_swap32_4 (T[2]);
+    T[3] = hc_swap32_4 (T[3]);
+
+    w0[0] = T[0].x;
+    w0[1] = T[0].y;
+    w0[2] = T[0].z;
+    w0[3] = T[0].w;
+    w1[0] = T[1].x;
+    w1[1] = T[1].y;
+    w1[2] = T[1].z;
+    w1[3] = T[1].w;
+    w2[0] = T[2].x;
+    w2[1] = T[2].y;
+    w2[2] = T[2].z;
+    w2[3] = T[2].w;
+    w3[0] = T[3].x;
+    w3[1] = T[3].y;
+    w3[2] = T[3].z;
+    w3[3] = T[3].w;

     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
   }
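Annotation (not part of the patch): the column shuffle that scrypt_smix_init and scrypt_smix_loop previously applied on every entry and exit is hoisted into the init and comp kernels, so the per-iteration loop kernels now do only salsa_r and V-buffer traffic. This is legal because the two permutations are mutual inverses. A minimal standalone C check of that round trip follows; the uint4 type and helpers are stand-ins, none of this is hashcat code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t x, y, z, w; } uint4;

static void shuffle_fwd (const uint4 *t, uint4 *x)
{
  // mirrors the init-kernel shuffle: X[j] takes .x from T[j], .y from T[j+1], ...
  x[0] = (uint4) { t[0].x, t[1].y, t[2].z, t[3].w };
  x[1] = (uint4) { t[1].x, t[2].y, t[3].z, t[0].w };
  x[2] = (uint4) { t[2].x, t[3].y, t[0].z, t[1].w };
  x[3] = (uint4) { t[3].x, t[0].y, t[1].z, t[2].w };
}

static void shuffle_rev (const uint4 *x, uint4 *t)
{
  // mirrors the comp-kernel shuffle: the inverse rotation
  t[0] = (uint4) { x[0].x, x[3].y, x[2].z, x[1].w };
  t[1] = (uint4) { x[1].x, x[0].y, x[3].z, x[2].w };
  t[2] = (uint4) { x[2].x, x[1].y, x[0].z, x[3].w };
  t[3] = (uint4) { x[3].x, x[2].y, x[1].z, x[0].w };
}

int main (void)
{
  uint4 t[4], x[4], u[4];

  for (int i = 0; i < 4; i++) t[i] = (uint4) { i * 4 + 0, i * 4 + 1, i * 4 + 2, i * 4 + 3 };

  shuffle_fwd (t, x);
  shuffle_rev (x, u);

  for (int i = 0; i < 4; i++)
  {
    assert (t[i].x == u[i].x && t[i].y == u[i].y && t[i].z == u[i].z && t[i].w == u[i].w);
  }

  printf ("shuffle round-trip OK\n");

  return 0;
}

The same reasoning covers the byte swap: hc_swap32_4 acts per 32-bit component, so it commutes with the permutation and can move out of the loop kernels together with it.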
diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl
index dfb09edd4..373316024 100644
--- a/OpenCL/m15700-pure.cl
+++ b/OpenCL/m15700-pure.cl
@@ -177,22 +177,18 @@ DECLSPEC void salsa_r (uint4 *TI)
     TT[idx_r2++] = R3;
   }

-  idx_r1 = 0;
-  idx_r2 = SCRYPT_R * 4;
+  idx_r2 = 0;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (int i = 0; i < SCRYPT_R; i++)
   {
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
   }
 }

-DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_init (uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -215,55 +211,15 @@ DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_
     case 3: V = V3; break;
   }

-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
-
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];

     for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
   }
-
-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
 }

-DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_loop (uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -286,26 +242,6 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_
     case 3: V = V3; break;
   }

-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
-
   for (u32 N_pos = 0; N_pos < 1024; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
@@ -314,6 +250,8 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_

     const u32 km = k - (y * SCRYPT_TMTO);

+    uint4 T[STATE_CNT4];
+
     for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];

     for (u32 i = 0; i < km; i++) salsa_r (T);
@@ -322,26 +260,6 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_

     salsa_r (X);
   }
-
-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
 }

 #ifndef KECCAK_ROUNDS
@@ -541,15 +459,50 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_
     tmps[gid].P[k + 0] = tmp0;
     tmps[gid].P[k + 1] = tmp1;
   }
+
+  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+  {
+    uint4 T[4];
+
+    T[0] = tmps[gid].P[l + 0];
+    T[1] = tmps[gid].P[l + 1];
+    T[2] = tmps[gid].P[l + 2];
+    T[3] = tmps[gid].P[l + 3];
+
+    T[0] = hc_swap32_4 (T[0]);
+    T[1] = hc_swap32_4 (T[1]);
+    T[2] = hc_swap32_4 (T[2]);
+    T[3] = hc_swap32_4 (T[3]);
+
+    uint4 X[4];
+
+    #ifdef IS_CUDA
+    X[0] = make_uint4 (T[0].x, T[1].y, T[2].z, T[3].w);
+    X[1] = make_uint4 (T[1].x, T[2].y, T[3].z, T[0].w);
+    X[2] = make_uint4 (T[2].x, T[3].y, T[0].z, T[1].w);
+    X[3] = make_uint4 (T[3].x, T[0].y, T[1].z, T[2].w);
+    #else
+    X[0] = (uint4) (T[0].x, T[1].y, T[2].z, T[3].w);
+    X[1] = (uint4) (T[1].x, T[2].y, T[3].z, T[0].w);
+    X[2] = (uint4) (T[2].x, T[3].y, T[0].z, T[1].w);
+    X[3] = (uint4) (T[3].x, T[0].y, T[1].z, T[2].w);
+    #endif
+
+    tmps[gid].P[l + 0] = X[0];
+    tmps[gid].P[l + 1] = X[1];
+    tmps[gid].P[l + 2] = X[2];
+    tmps[gid].P[l + 3] = X[3];
+  }
 }

-KERNEL_FQ void m15700_loop_prepare (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
+KERNEL_FQ void m15700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
 {
   /**
   * base
   */

   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);

   if (gid >= gid_max) return;

@@ -561,26 +514,20 @@ KERNEL_FQ void m15700_loop_prepare (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum
   GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;

   uint4 X[STATE_CNT4];
-  uint4 T[STATE_CNT4];

   const u32 P_offset = salt_repeat * STATE_CNT4;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = tmps[gid].P[P_offset + z];

-  scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = X[z];
 }

-KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
+KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
 {
   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);

   if (gid >= gid_max) return;

@@ -590,21 +537,14 @@ KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_
   GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;

   uint4 X[STATE_CNT4];
-  uint4 T[STATE_CNT4];

   const u32 P_offset = salt_repeat * STATE_CNT4;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = tmps[gid].P[P_offset + z];

-  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  scrypt_smix_loop (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = X[z];
 }

 KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t))
@@ -633,35 +573,48 @@ KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_

   for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
   {
-    uint4 tmp;
-
-    tmp = tmps[gid].P[l + 0];
+    uint4 X[4];

-    w0[0] = tmp.x;
-    w0[1] = tmp.y;
-    w0[2] = tmp.z;
-    w0[3] = tmp.w;
+    X[0] = tmps[gid].P[l + 0];
+    X[1] = tmps[gid].P[l + 1];
+    X[2] = tmps[gid].P[l + 2];
+    X[3] = tmps[gid].P[l + 3];

-    tmp = tmps[gid].P[l + 1];
+    uint4 T[4];

-    w1[0] = tmp.x;
-    w1[1] = tmp.y;
-    w1[2] = tmp.z;
-    w1[3] = tmp.w;
-
-    tmp = tmps[gid].P[l + 2];
-
-    w2[0] = tmp.x;
-    w2[1] = tmp.y;
-    w2[2] = tmp.z;
-    w2[3] = tmp.w;
-
-    tmp = tmps[gid].P[l + 3];
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[0].x, X[3].y, X[2].z, X[1].w);
+    T[1] = make_uint4 (X[1].x, X[0].y, X[3].z, X[2].w);
+    T[2] = make_uint4 (X[2].x, X[1].y, X[0].z, X[3].w);
+    T[3] = make_uint4 (X[3].x, X[2].y, X[1].z, X[0].w);
+    #else
+    T[0] = (uint4) (X[0].x, X[3].y, X[2].z, X[1].w);
+    T[1] = (uint4) (X[1].x, X[0].y, X[3].z, X[2].w);
+    T[2] = (uint4) (X[2].x, X[1].y, X[0].z, X[3].w);
+    T[3] = (uint4) (X[3].x, X[2].y, X[1].z, X[0].w);
+    #endif

-    w3[0] = tmp.x;
-    w3[1] = tmp.y;
-    w3[2] = tmp.z;
-    w3[3] = tmp.w;
+    T[0] = hc_swap32_4 (T[0]);
+    T[1] = hc_swap32_4 (T[1]);
+    T[2] = hc_swap32_4 (T[2]);
+    T[3] = hc_swap32_4 (T[3]);
+
+    w0[0] = T[0].x;
+    w0[1] = T[0].y;
+    w0[2] = T[0].z;
+    w0[3] = T[0].w;
+    w1[0] = T[1].x;
+    w1[1] = T[1].y;
+    w1[2] = T[1].z;
+    w1[3] = T[1].w;
+    w2[0] = T[2].x;
+    w2[1] = T[2].y;
+    w2[2] = T[2].z;
+    w2[3] = T[2].w;
+    w3[0] = T[3].x;
+    w3[1] = T[3].y;
+    w3[2] = T[3].z;
+    w3[3] = T[3].w;

     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
   }
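Annotation: the k/y/km lines kept in scrypt_smix_loop implement the time-memory trade-off lookup. V stores only every SCRYPT_TMTO-th row of the scrypt state array, so a request for row k loads checkpoint y = k / SCRYPT_TMTO and re-applies salsa_r until the row is reconstructed. A toy calculation of those indices, with assumed constants:

#include <stdio.h>

#define SCRYPT_N    1024 /* assumed for the example */
#define SCRYPT_TMTO 4    /* assumed TMTO factor */

int main (void)
{
  const unsigned k = 999 & (SCRYPT_N - 1); // row index, masked as in the kernels

  const unsigned y  = k / SCRYPT_TMTO;       // checkpoint row actually stored in V
  const unsigned km = k - (y * SCRYPT_TMTO); // salsa_r applications needed to catch up

  printf ("row %u -> load checkpoint %u, then %u x salsa_r\n", k, y, km);

  return 0;
}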
diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl
index 66e1285d9..dfdbbe041 100644
--- a/OpenCL/m22700-pure.cl
+++ b/OpenCL/m22700-pure.cl
@@ -218,22 +218,18 @@ DECLSPEC void salsa_r (uint4 *TI)
     TT[idx_r2++] = R3;
   }

-  idx_r1 = 0;
-  idx_r2 = SCRYPT_R * 4;
+  idx_r2 = 0;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
   for (int i = 0; i < SCRYPT_R; i++)
   {
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
-    TI[idx_r2++] = TT[idx_r1++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
+    TI[idx_r1++] = TT[idx_r2++];
   }
 }

-DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_init (uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -256,55 +252,15 @@ DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_
     case 3: V = V3; break;
   }

-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
-
   for (u32 y = 0; y < ySIZE; y++)
   {
     for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];

     for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X);
   }
-
-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
 }

-DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
+DECLSPEC void scrypt_smix_loop (uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3)
 {
   #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
   #define CO Coord(xd4,y,z)
@@ -327,26 +283,6 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_
     case 3: V = V3; break;
   }

-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
-
   for (u32 N_pos = 0; N_pos < 1024; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
@@ -355,6 +291,8 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_

     const u32 km = k - (y * SCRYPT_TMTO);

+    uint4 T[STATE_CNT4];
+
     for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];

     for (u32 i = 0; i < km; i++) salsa_r (T);
@@ -363,26 +301,6 @@ DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_

     salsa_r (X);
   }
-
-  for (u32 i = 0; i < STATE_CNT4; i += 4)
-  {
-    #ifdef IS_CUDA
-    T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #else
-    T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w);
-    T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w);
-    T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w);
-    T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w);
-    #endif
-
-    X[i + 0] = T[0];
-    X[i + 1] = T[1];
-    X[i + 2] = T[2];
-    X[i + 3] = T[3];
-  }
 }

 KERNEL_FQ void m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -493,6 +411,40 @@ KERNEL_FQ void m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t))
     tmps[gid].P[k + 0] = tmp0;
     tmps[gid].P[k + 1] = tmp1;
   }
+
+  for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
+  {
+    uint4 T[4];
+
+    T[0] = tmps[gid].P[l + 0];
+    T[1] = tmps[gid].P[l + 1];
+    T[2] = tmps[gid].P[l + 2];
+    T[3] = tmps[gid].P[l + 3];
+
+    T[0] = hc_swap32_4 (T[0]);
+    T[1] = hc_swap32_4 (T[1]);
+    T[2] = hc_swap32_4 (T[2]);
+    T[3] = hc_swap32_4 (T[3]);
+
+    uint4 X[4];
+
+    #ifdef IS_CUDA
+    X[0] = make_uint4 (T[0].x, T[1].y, T[2].z, T[3].w);
+    X[1] = make_uint4 (T[1].x, T[2].y, T[3].z, T[0].w);
+    X[2] = make_uint4 (T[2].x, T[3].y, T[0].z, T[1].w);
+    X[3] = make_uint4 (T[3].x, T[0].y, T[1].z, T[2].w);
+    #else
+    X[0] = (uint4) (T[0].x, T[1].y, T[2].z, T[3].w);
+    X[1] = (uint4) (T[1].x, T[2].y, T[3].z, T[0].w);
+    X[2] = (uint4) (T[2].x, T[3].y, T[0].z, T[1].w);
+    X[3] = (uint4) (T[3].x, T[0].y, T[1].z, T[2].w);
+    #endif
+
+    tmps[gid].P[l + 0] = X[0];
+    tmps[gid].P[l + 1] = X[1];
+    tmps[gid].P[l + 2] = X[2];
+    tmps[gid].P[l + 3] = X[3];
+  }
 }

 KERNEL_FQ void m22700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -502,6 +454,7 @@
   */

   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);

   if (gid >= gid_max) return;

@@ -513,26 +466,20 @@ KERNEL_FQ void m22700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t))
   GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;

   uint4 X[STATE_CNT4];
-  uint4 T[STATE_CNT4];

   const u32 P_offset = salt_repeat * STATE_CNT4;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = tmps[gid].P[P_offset + z];

-  scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  scrypt_smix_init (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = X[z];
 }

 KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
 {
   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);

   if (gid >= gid_max) return;

@@ -542,21 +489,14 @@ KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t))
   GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf;

   uint4 X[STATE_CNT4];
-  uint4 T[STATE_CNT4];

   const u32 P_offset = salt_repeat * STATE_CNT4;

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]);
+  for (int z = 0; z < STATE_CNT4; z++) X[z] = tmps[gid].P[P_offset + z];

-  scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);
+  scrypt_smix_loop (X, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf);

-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
+  for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = X[z];
 }

 KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
@@ -665,35 +605,48 @@ KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))

   for (u32 l = 0; l < SCRYPT_CNT4; l += 4)
   {
-    uint4 tmp;
-
-    tmp = tmps[gid].P[l + 0];
+    uint4 X[4];

-    w0[0] = tmp.x;
-    w0[1] = tmp.y;
-    w0[2] = tmp.z;
-    w0[3] = tmp.w;
+    X[0] = tmps[gid].P[l + 0];
+    X[1] = tmps[gid].P[l + 1];
+    X[2] = tmps[gid].P[l + 2];
+    X[3] = tmps[gid].P[l + 3];

-    tmp = tmps[gid].P[l + 1];
+    uint4 T[4];

-    w1[0] = tmp.x;
-    w1[1] = tmp.y;
-    w1[2] = tmp.z;
-    w1[3] = tmp.w;
-
-    tmp = tmps[gid].P[l + 2];
-
-    w2[0] = tmp.x;
-    w2[1] = tmp.y;
-    w2[2] = tmp.z;
-    w2[3] = tmp.w;
-
-    tmp = tmps[gid].P[l + 3];
+    #ifdef IS_CUDA
+    T[0] = make_uint4 (X[0].x, X[3].y, X[2].z, X[1].w);
+    T[1] = make_uint4 (X[1].x, X[0].y, X[3].z, X[2].w);
+    T[2] = make_uint4 (X[2].x, X[1].y, X[0].z, X[3].w);
+    T[3] = make_uint4 (X[3].x, X[2].y, X[1].z, X[0].w);
+    #else
+    T[0] = (uint4) (X[0].x, X[3].y, X[2].z, X[1].w);
+    T[1] = (uint4) (X[1].x, X[0].y, X[3].z, X[2].w);
+    T[2] = (uint4) (X[2].x, X[1].y, X[0].z, X[3].w);
+    T[3] = (uint4) (X[3].x, X[2].y, X[1].z, X[0].w);
+    #endif

-    w3[0] = tmp.x;
-    w3[1] = tmp.y;
-    w3[2] = tmp.z;
-    w3[3] = tmp.w;
+    T[0] = hc_swap32_4 (T[0]);
+    T[1] = hc_swap32_4 (T[1]);
+    T[2] = hc_swap32_4 (T[2]);
+    T[3] = hc_swap32_4 (T[3]);
+
+    w0[0] = T[0].x;
+    w0[1] = T[0].y;
+    w0[2] = T[0].z;
+    w0[3] = T[0].w;
+    w1[0] = T[1].x;
+    w1[1] = T[1].y;
+    w1[2] = T[1].z;
+    w1[3] = T[1].w;
+    w2[0] = T[2].x;
+    w2[1] = T[2].y;
+    w2[2] = T[2].z;
+    w2[3] = T[2].w;
+    w3[0] = T[3].x;
+    w3[1] = T[3].y;
+    w3[2] = T[3].z;
+    w3[3] = T[3].w;

     sha256_hmac_update_64 (&ctx, w0, w1, w2, w3, 64);
   }
diff --git a/hashcat.hctune b/hashcat.hctune
index 4f14768c2..c8ef4f45c 100644
--- a/hashcat.hctune
+++ b/hashcat.hctune
@@ -370,12 +370,13 @@ GeForce_GTX_TITAN                       3       9900    2       A
 ##

 DEVICE_TYPE_CPU                         *       8900    1       N       A
-DEVICE_TYPE_GPU                         *       8900    1       N       A
 DEVICE_TYPE_CPU                         *       9300    1       N       A
-DEVICE_TYPE_GPU                         *       9300    1       N       A
 DEVICE_TYPE_CPU                         *       15700   1       N       A
-DEVICE_TYPE_GPU                         *       15700   1       1       A
 DEVICE_TYPE_CPU                         *       22700   1       N       A
+
+DEVICE_TYPE_GPU                         *       8900    1       N       A
+DEVICE_TYPE_GPU                         *       9300    1       N       A
+DEVICE_TYPE_GPU                         *       15700   1       1       A
 DEVICE_TYPE_GPU                         *       22700   1       N       A

 ## Here's an example of how to manually tune SCRYPT algorithm kernels for your hardware.
@@ -399,7 +400,7 @@ DEVICE_TYPE_GPU                         *       22700   1       N
 ## 3. Artificial multiplier (--kernel-accel aka -n)
 ##
 ## In order to find these values:
-## 
+##
 ## 1. On startup Hashcat will show: * Device #1: GeForce GTX 980, 3963/4043 MB, 16MCU. The 16 MCU is the number of compute units on that device.
 ## 2. Native thread counts are fixed values: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefronts), GPU-NVIDIA=32 (warps)
 ##
@@ -412,12 +413,12 @@ DEVICE_TYPE_GPU                         *       22700   1       N
 ##
 ## How do we deal with this? This is where SCRYPT TMTO (time-memory trade off) kicks in. The SCRYPT algorithm is designed in such a way that we
 ## can pre-compute that 16MB buffer from a self-chosen offset. Details on how this actually works are not important for this process.
-## 
-## What's relevant to us is that we can halve the buffer size, but we pay with twice the computation time. 
+##
+## What's relevant to us is that we can halve the buffer size, but we pay with twice the computation time.
 ## We can repeat this as often as we want. That's why it's a trade-off.
 ##
 ## This mechanic can be manually set using --scrypt-tmto on the commandline, but this is not the best way.
-## 
+##
 ## Back to our problem. We need 8GB of memory but have only ~4GB.
 ## It's not a full 4GB. The OS needs some of it and Hashcat needs some of it to store password candidates and other things.
 ## If you run a headless server it should be safe to subtract a fixed value of 200MB from whatever you have in your GPU.
@@ -426,7 +427,7 @@ DEVICE_TYPE_GPU                         *       22700   1       N
 ##
 ## (8GB >> 0) = 8GB < 3.8GB = No, Does not fit
 ## (8GB >> 1) = 4GB < 3.8GB = No, Does not fit
-## (8GB >> 2) = 2GB < 3.8GB = Yes! 
+## (8GB >> 2) = 2GB < 3.8GB = Yes!
 ##
 ## This process is automated in Hashcat, but it is important to understand what's happening here.
 ## Because of the light overhead from the OS and Hashcat, we pay a very high price.
@@ -440,7 +441,7 @@ DEVICE_TYPE_GPU                         *       22700   1       N
 ## Therefore, we do not need to increase the TMTO by another step to fit in VRAM.
 ##
 ## If we cut down our 16 MCU to only 15 MCU or 14 MCU using --kernel-accel(-n), we end up with:
-## 
+##
 ## 16 * 32 * 16777216 = 8589934592 / 2 = 4294967296 = 4.00GB < 3.80GB = Nope, next
 ## 15 * 32 * 16777216 = 8053063680 / 2 = 4026531840 = 3.84GB < 3.80GB = Nope, next
 ## 14 * 32 * 16777216 = 7516192768 / 2 = 3758096384 = 3.58GB < 3.80GB = Yes!
@@ -459,19 +460,24 @@ DEVICE_TYPE_GPU                         *       22700   1       N
 ## On my GTX980, this improves the performance from 201 H/s to 255 H/s.
 ## Again, there's no need to control this with --scrypt-tmto. Hashcat will realize it has to increase the TMTO again.
 ##
-## All together, you can control all of this by using the -n parameter in the command line. 
+## All together, you can control all of this by using the -n parameter in the command line.
 ## This is not ideal in a production environment because you must use the --force flag.
 ## The best way to set this is by using this hashcat.hctune file to store it. This avoids the need to bypass any warnings.
 ##
-## Find the ideal -n value, then store it here along with the proper compute device name. 
+## Find the ideal -n value, then store it here along with the proper compute device name.
 ## Formatting guidelines are available at the top of this document.

 GeForce_GTX_980                         *       8900    1       28      A
 GeForce_GTX_980                         *       9300    1       128     A
-GeForce_GTX_980                         *       15700   1       1       A
+GeForce_GTX_980                         *       15700   1       2       A
 GeForce_GTX_980                         *       22700   1       28      A

-GeForce_RTX_2080_Ti                     *       8900    1       N       A
+GeForce_RTX_2080_Ti                     *       8900    1       38      A
 GeForce_RTX_2080_Ti                     *       9300    1       544     A
-GeForce_RTX_2080_Ti                     *       15700   1       4       A
-GeForce_RTX_2080_Ti                     *       22700   1       N       A
+GeForce_RTX_2080_Ti                     *       15700   1       8       A
+GeForce_RTX_2080_Ti                     *       22700   1       38      A
+
+gfx900                                  *       8900    1       28      A
+gfx900                                  *       9300    1       384     A
+gfx900                                  *       15700   1       6       A
+gfx900                                  *       22700   1       28      A
diff --git a/src/backend.c b/src/backend.c
index 79c4944c9..a5e33c236 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -8381,6 +8381,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     device_param->size_st_salts  = size_st_salts;
     device_param->size_st_esalts = size_st_esalts;

+    // extra buffer
+
     u64 size_extra_buffer = 4;

     if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT)
diff --git a/src/modules/module_08900.c b/src/modules/module_08900.c
index 94621e983..fe2ea77d4 100644
--- a/src/modules/module_08900.c
+++ b/src/modules/module_08900.c
@@ -250,24 +250,9 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY

   const u64 tmp_size = 128ULL * scrypt_r * scrypt_p;

-  char *unroll = "";
-
-  // NVIDIA GPU
-  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
-  {
-    unroll = "-D _unroll";
-  }
-
-  // ROCM
-  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
-  {
-    unroll = "-D _unroll";
-  }
-
   char *jit_build_options = NULL;

-  hc_asprintf (&jit_build_options, "%s -DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
-               unroll,
+  hc_asprintf (&jit_build_options, "-DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
               hashes->salts_buf[0].scrypt_N,
               hashes->salts_buf[0].scrypt_r,
               hashes->salts_buf[0].scrypt_p,
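Annotation: the tuning entries above follow the arithmetic spelled out in the hashcat.hctune comments, where the memory demand is kernel-accel times native threads times the per-state SCRYPT buffer, halved once per TMTO step. This one-shot calculator reproduces the GTX 980 example numbers from those comments (16 MCU, 32 native threads, a 16 MB state, one TMTO step, roughly 3.8 GB of usable VRAM):

#include <stdio.h>

int main (void)
{
  const unsigned long long state = 16777216ULL;    /* 16 MB scrypt buffer, per the example */
  const unsigned long long avail = 3800000000ULL;  /* ~3.8 GB usable VRAM, per the example */

  for (unsigned mcu = 16; mcu >= 14; mcu--)
  {
    const unsigned long long need = mcu * 32ULL * state / 2; /* one TMTO halving */

    printf ("%2u MCU: %llu bytes -> %s\n", mcu, need, need < avail ? "fits" : "does not fit");
  }

  return 0;
}

Running it prints "does not fit" for 16 and 15 MCU and "fits" for 14, matching the Nope/Nope/Yes walk-through in the comments.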
diff --git a/src/modules/module_09300.c b/src/modules/module_09300.c
index 2fc876df2..fbf5a6064 100644
--- a/src/modules/module_09300.c
+++ b/src/modules/module_09300.c
@@ -250,24 +250,9 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY

   const u64 tmp_size = 128ULL * scrypt_r * scrypt_p;

-  char *unroll = "";
-
-  // NVIDIA GPU
-  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
-  {
-    unroll = "-D _unroll";
-  }
-
-  // ROCM
-  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
-  {
-    unroll = "-D _unroll";
-  }
-
   char *jit_build_options = NULL;

-  hc_asprintf (&jit_build_options, "%s -DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
-               unroll,
+  hc_asprintf (&jit_build_options, "-DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
               hashes->salts_buf[0].scrypt_N,
               hashes->salts_buf[0].scrypt_r,
               hashes->salts_buf[0].scrypt_p,
diff --git a/src/modules/module_15700.c b/src/modules/module_15700.c
index bcf314546..fea92056d 100644
--- a/src/modules/module_15700.c
+++ b/src/modules/module_15700.c
@@ -23,7 +23,6 @@ static const u64   KERN_TYPE      = 15700;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_MP_MULTI_DISABLE
-                                  | OPTS_TYPE_NATIVE_THREADS
                                   | OPTS_TYPE_LOOP_PREPARE
                                   | OPTS_TYPE_SELF_TEST_DISABLE
                                   | OPTS_TYPE_ST_HEX;
@@ -73,6 +72,13 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_
   return kernel_loops_max;
 }

+u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_max = 4;
+
+  return kernel_threads_max;
+}
+
 u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u64 esalt_size = (const u64) sizeof (ethereum_scrypt_t);
@@ -265,24 +271,9 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY

   const u64 tmp_size = 128ULL * scrypt_r * scrypt_p;

-  char *unroll = "";
-
-  // NVIDIA GPU
-  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
-  {
-    unroll = "-D _unroll";
-  }
-
-  // ROCM
-  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
-  {
-    unroll = "-D _unroll";
-  }
-
   char *jit_build_options = NULL;

-  hc_asprintf (&jit_build_options, "%s -DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
-               unroll,
+  hc_asprintf (&jit_build_options, "-DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
               hashes->salts_buf[0].scrypt_N,
               hashes->salts_buf[0].scrypt_r,
               hashes->salts_buf[0].scrypt_p,
@@ -507,7 +498,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_min        = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max        = module_kernel_loops_max;
   module_ctx->module_kernel_loops_min        = module_kernel_loops_min;
-  module_ctx->module_kernel_threads_max      = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max      = module_kernel_threads_max;
   module_ctx->module_kernel_threads_min      = MODULE_DEFAULT;
   module_ctx->module_kern_type               = module_kern_type;
   module_ctx->module_kern_type_dynamic       = MODULE_DEFAULT;
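Annotation: the module_15700.c change swaps the blanket OPTS_TYPE_NATIVE_THREADS flag for a hard kernel_threads_max of 4, which bounds how many scrypt states must be resident per compute unit. Ethereum wallet scrypt commonly uses N=262144, r=8, p=1, so each work-item needs 128 * r * N = 256 MiB of V buffer before any TMTO is applied; the thread count therefore dominates the VRAM budget. Illustrative numbers only:

#include <stdio.h>

int main (void)
{
  const unsigned long long state = 128ULL * 8 * 262144; /* 256 MiB: 128 * r * N for r=8, N=262144 */

  for (unsigned threads = 32; threads >= 4; threads /= 2)
  {
    printf ("%2u threads/CU: %llu MiB per compute unit\n",
            threads, threads * state / (1024 * 1024));
  }

  return 0;
}

At 32 native threads that is 8 GiB per compute unit; capping at 4 threads brings it down to 1 GiB, which is what makes the higher kernel-accel values in the new hctune entries workable.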
diff --git a/src/modules/module_22700.c b/src/modules/module_22700.c
index 46ffb9542..5842118f3 100644
--- a/src/modules/module_22700.c
+++ b/src/modules/module_22700.c
@@ -251,24 +251,9 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY

   const u64 tmp_size = 128ULL * scrypt_r * scrypt_p;

-  char *unroll = "";
-
-  // NVIDIA GPU
-  if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
-  {
-    unroll = "-D _unroll";
-  }
-
-  // ROCM
-  if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true))
-  {
-    unroll = "-D _unroll";
-  }
-
   char *jit_build_options = NULL;

-  hc_asprintf (&jit_build_options, "%s -DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
-               unroll,
+  hc_asprintf (&jit_build_options, "-DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
               hashes->salts_buf[0].scrypt_N,
               hashes->salts_buf[0].scrypt_r,
               hashes->salts_buf[0].scrypt_p,
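Annotation: with the vendor-specific _unroll handling removed, all four modules now build the same JIT define string. A standalone reproduction of the string the modified hc_asprintf call produces, using example scrypt parameters that are not taken from the patch:

#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  const unsigned scrypt_N = 16384, scrypt_r = 1, scrypt_p = 1; /* example preset */
  const uint64_t tmto = 2, tmp_elem = 128;                     /* hypothetical values */

  char buf[128];

  snprintf (buf, sizeof (buf),
            "-DSCRYPT_N=%u -DSCRYPT_R=%u -DSCRYPT_P=%u -DSCRYPT_TMTO=%" PRIu64 " -DSCRYPT_TMP_ELEM=%" PRIu64,
            scrypt_N, scrypt_r, scrypt_p, tmto, tmp_elem);

  puts (buf);

  return 0;
}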