diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h index 9b3437326..86e3b7e7f 100644 --- a/OpenCL/inc_common.h +++ b/OpenCL/inc_common.h @@ -62,6 +62,7 @@ MAYBE_UNUSED const u32 digests_cnt, \ MAYBE_UNUSED const u32 digests_offset_host, \ MAYBE_UNUSED const u32 combs_mode, \ + MAYBE_UNUSED const u32 salt_repeat, \ MAYBE_UNUSED const u64 pws_pos, \ MAYBE_UNUSED const u64 gid_max #else @@ -100,6 +101,7 @@ MAYBE_UNUSED const u32 digests_cnt, \ MAYBE_UNUSED const u32 digests_offset_host, \ MAYBE_UNUSED const u32 combs_mode, \ + MAYBE_UNUSED const u32 salt_repeat, \ MAYBE_UNUSED const u64 pws_pos, \ MAYBE_UNUSED const u64 gid_max #endif diff --git a/OpenCL/inc_types.h b/OpenCL/inc_types.h index 9a5173c54..a6b9ea85e 100644 --- a/OpenCL/inc_types.h +++ b/OpenCL/inc_types.h @@ -1642,6 +1642,7 @@ typedef struct salt u32 salt_iter; u32 salt_iter2; u32 salt_sign[2]; + u32 salt_repeats; u32 orig_pos; diff --git a/OpenCL/m08900-pure.cl b/OpenCL/m08900-pure.cl index cb0077e17..ccae9bda7 100644 --- a/OpenCL/m08900-pure.cl +++ b/OpenCL/m08900-pure.cl @@ -170,14 +170,16 @@ DECLSPEC void salsa_r (uint4 *TI) TO[idx_r2++] = R3; } + #ifdef _unroll #pragma unroll + #endif for (int i = 0; i < STATE_CNT4; i++) { TI[i] = TO[i]; } } -DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) +DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) { #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z)) #define CO Coord(xd4,y,z) @@ -200,9 +202,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui case 3: V = V3; break; } - #ifdef _unroll - #pragma unroll - #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { #ifdef IS_CUDA @@ -230,7 +229,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } - for (u32 i = 0; i < SCRYPT_N; i++) + for (u32 i = 0; i < STATE_CNT4; i += 4) + { + #ifdef IS_CUDA + T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); + T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); + T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); + T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w); + #else + T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); + T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); + T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); + T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w); + #endif + + X[i + 0] = T[0]; + X[i + 1] = T[1]; + X[i + 2] = T[2]; + X[i + 3] = T[3]; + } +} + +DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) +{ + #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z)) + #define CO Coord(xd4,y,z) + + const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO; + const u32 zSIZE = STATE_CNT4; + + const u32 x = get_global_id (0); + + const u32 xd4 = x / 4; + const u32 xm4 = x & 3; + + GLOBAL_AS uint4 *V; + + switch (xm4) + { + case 0: V = V0; break; + case 1: V = V1; break; + case 2: V = V2; break; + case 3: V = V3; break; + } + + for (u32 i = 0; i < STATE_CNT4; i += 4) + { + #ifdef IS_CUDA + T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); + T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); + T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i 
+ 1].w); + T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w); + #else + T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); + T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); + T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); + T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w); + #endif + + X[i + 0] = T[0]; + X[i + 1] = T[1]; + X[i + 2] = T[2]; + X[i + 3] = T[3]; + } + + for (u32 N_pos = 0; N_pos < 1024; N_pos++) { const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1); @@ -247,9 +310,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui salsa_r (X); } - #ifdef _unroll - #pragma unroll - #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { #ifdef IS_CUDA @@ -341,6 +401,41 @@ KERNEL_FQ void m08900_init (KERN_ATTR_TMPS (scrypt_tmp_t)) } } +KERNEL_FQ void m08900_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + // SCRYPT part, init V + + GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; + GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; + GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; + GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + + uint4 X[STATE_CNT4]; + uint4 T[STATE_CNT4]; + + const u32 P_offset = salt_repeat * STATE_CNT4; + + #ifdef _unroll + #pragma unroll + #endif + for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]); + + scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); + + #ifdef _unroll + #pragma unroll + #endif + for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]); +} + KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) { const u64 gid = get_global_id (0); @@ -355,28 +450,19 @@ KERNEL_FQ void m08900_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) uint4 X[STATE_CNT4]; uint4 T[STATE_CNT4]; - #ifdef _unroll - #pragma unroll - #endif - for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]); - - scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); + const u32 P_offset = salt_repeat * STATE_CNT4; #ifdef _unroll #pragma unroll #endif - for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]); + for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]); - #if SCRYPT_P >= 1 - for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4) - { - for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]); + scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); - scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); - - for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]); - } + #ifdef _unroll + #pragma unroll #endif + for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]); } KERNEL_FQ void m08900_comp (KERN_ATTR_TMPS (scrypt_tmp_t)) diff --git a/OpenCL/m15700-pure.cl b/OpenCL/m15700-pure.cl index c3e32fae9..d6b5d251f 100644 --- a/OpenCL/m15700-pure.cl +++ b/OpenCL/m15700-pure.cl @@ -184,7 +184,7 @@ DECLSPEC void salsa_r (uint4 *TI) } } -DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) +DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) { #define Coord(xd4,y,z) (((xd4) 
* ySIZE * zSIZE) + ((y) * zSIZE) + (z)) #define CO Coord(xd4,y,z) @@ -207,9 +207,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui case 3: V = V3; break; } - #ifdef _unroll - #pragma unroll - #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { #ifdef IS_CUDA @@ -237,7 +234,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } - for (u32 i = 0; i < SCRYPT_N; i++) + for (u32 i = 0; i < STATE_CNT4; i += 4) + { + #ifdef IS_CUDA + T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); + T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); + T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); + T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w); + #else + T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); + T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); + T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); + T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w); + #endif + + X[i + 0] = T[0]; + X[i + 1] = T[1]; + X[i + 2] = T[2]; + X[i + 3] = T[3]; + } +} + +DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) +{ + #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z)) + #define CO Coord(xd4,y,z) + + const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO; + const u32 zSIZE = STATE_CNT4; + + const u32 x = get_global_id (0); + + const u32 xd4 = x / 4; + const u32 xm4 = x & 3; + + GLOBAL_AS uint4 *V; + + switch (xm4) + { + case 0: V = V0; break; + case 1: V = V1; break; + case 2: V = V2; break; + case 3: V = V3; break; + } + + for (u32 i = 0; i < STATE_CNT4; i += 4) + { + #ifdef IS_CUDA + T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); + T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); + T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); + T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w); + #else + T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); + T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); + T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); + T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w); + #endif + + X[i + 0] = T[0]; + X[i + 1] = T[1]; + X[i + 2] = T[2]; + X[i + 3] = T[3]; + } + + for (u32 N_pos = 0; N_pos < 1024; N_pos++) { const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1); @@ -254,9 +315,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui salsa_r (X); } - #ifdef _unroll - #pragma unroll - #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { #ifdef IS_CUDA @@ -477,6 +535,41 @@ KERNEL_FQ void m15700_init (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_ } } +KERNEL_FQ void m15700_loop_prepare (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t)) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + // SCRYPT part, init V + + GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; + GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; + GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; + GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + + uint4 X[STATE_CNT4]; + uint4 T[STATE_CNT4]; + + const u32 P_offset = salt_repeat * STATE_CNT4; + + #ifdef _unroll + #pragma unroll + #endif + for (int z = 0; z < 
STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]); + + scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); + + #ifdef _unroll + #pragma unroll + #endif + for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]); +} + KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t)) { const u64 gid = get_global_id (0); @@ -491,28 +584,19 @@ KERNEL_FQ void m15700_loop (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_ uint4 X[STATE_CNT4]; uint4 T[STATE_CNT4]; - #ifdef _unroll - #pragma unroll - #endif - for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]); - - scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); + const u32 P_offset = salt_repeat * STATE_CNT4; #ifdef _unroll #pragma unroll #endif - for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]); + for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]); - #if SCRYPT_P >= 1 - for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4) - { - for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]); + scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); - scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); - - for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]); - } + #ifdef _unroll + #pragma unroll #endif + for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]); } KERNEL_FQ void m15700_comp (KERN_ATTR_TMPS_ESALT (scrypt_tmp_t, ethereum_scrypt_t)) diff --git a/OpenCL/m22700-pure.cl b/OpenCL/m22700-pure.cl index 0d0b50763..c9fb70d0e 100644 --- a/OpenCL/m22700-pure.cl +++ b/OpenCL/m22700-pure.cl @@ -225,7 +225,7 @@ DECLSPEC void salsa_r (uint4 *TI) } } -DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) +DECLSPEC void scrypt_smix_init (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) { #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z)) #define CO Coord(xd4,y,z) @@ -248,9 +248,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui case 3: V = V3; break; } - #ifdef _unroll - #pragma unroll - #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { #ifdef IS_CUDA @@ -278,7 +275,71 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui for (u32 i = 0; i < SCRYPT_TMTO; i++) salsa_r (X); } - for (u32 i = 0; i < SCRYPT_N; i++) + for (u32 i = 0; i < STATE_CNT4; i += 4) + { + #ifdef IS_CUDA + T[0] = make_uint4 (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); + T[1] = make_uint4 (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); + T[2] = make_uint4 (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); + T[3] = make_uint4 (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w); + #else + T[0] = (uint4) (X[i + 0].x, X[i + 3].y, X[i + 2].z, X[i + 1].w); + T[1] = (uint4) (X[i + 1].x, X[i + 0].y, X[i + 3].z, X[i + 2].w); + T[2] = (uint4) (X[i + 2].x, X[i + 1].y, X[i + 0].z, X[i + 3].w); + T[3] = (uint4) (X[i + 3].x, X[i + 2].y, X[i + 1].z, X[i + 0].w); + #endif + + X[i + 0] = T[0]; + X[i + 1] = T[1]; + X[i + 2] = T[2]; + X[i + 3] = T[3]; + } +} + +DECLSPEC void scrypt_smix_loop (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS uint4 *V1, GLOBAL_AS uint4 *V2, GLOBAL_AS uint4 *V3) +{ + #define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * 
zSIZE) + (z)) + #define CO Coord(xd4,y,z) + + const u32 ySIZE = SCRYPT_N / SCRYPT_TMTO; + const u32 zSIZE = STATE_CNT4; + + const u32 x = get_global_id (0); + + const u32 xd4 = x / 4; + const u32 xm4 = x & 3; + + GLOBAL_AS uint4 *V; + + switch (xm4) + { + case 0: V = V0; break; + case 1: V = V1; break; + case 2: V = V2; break; + case 3: V = V3; break; + } + + for (u32 i = 0; i < STATE_CNT4; i += 4) + { + #ifdef IS_CUDA + T[0] = make_uint4 (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); + T[1] = make_uint4 (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); + T[2] = make_uint4 (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); + T[3] = make_uint4 (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w); + #else + T[0] = (uint4) (X[i + 0].x, X[i + 1].y, X[i + 2].z, X[i + 3].w); + T[1] = (uint4) (X[i + 1].x, X[i + 2].y, X[i + 3].z, X[i + 0].w); + T[2] = (uint4) (X[i + 2].x, X[i + 3].y, X[i + 0].z, X[i + 1].w); + T[3] = (uint4) (X[i + 3].x, X[i + 0].y, X[i + 1].z, X[i + 2].w); + #endif + + X[i + 0] = T[0]; + X[i + 1] = T[1]; + X[i + 2] = T[2]; + X[i + 3] = T[3]; + } + + for (u32 N_pos = 0; N_pos < 1024; N_pos++) { const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1); @@ -295,9 +356,6 @@ DECLSPEC void scrypt_smix (uint4 *X, uint4 *T, GLOBAL_AS uint4 *V0, GLOBAL_AS ui salsa_r (X); } - #ifdef _unroll - #pragma unroll - #endif for (u32 i = 0; i < STATE_CNT4; i += 4) { #ifdef IS_CUDA @@ -429,6 +487,41 @@ KERNEL_FQ void m22700_init (KERN_ATTR_TMPS (scrypt_tmp_t)) } } +KERNEL_FQ void m22700_loop_prepare (KERN_ATTR_TMPS (scrypt_tmp_t)) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + // SCRYPT part, init V + + GLOBAL_AS uint4 *d_scrypt0_buf = (GLOBAL_AS uint4 *) d_extra0_buf; + GLOBAL_AS uint4 *d_scrypt1_buf = (GLOBAL_AS uint4 *) d_extra1_buf; + GLOBAL_AS uint4 *d_scrypt2_buf = (GLOBAL_AS uint4 *) d_extra2_buf; + GLOBAL_AS uint4 *d_scrypt3_buf = (GLOBAL_AS uint4 *) d_extra3_buf; + + uint4 X[STATE_CNT4]; + uint4 T[STATE_CNT4]; + + const u32 P_offset = salt_repeat * STATE_CNT4; + + #ifdef _unroll + #pragma unroll + #endif + for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]); + + scrypt_smix_init (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); + + #ifdef _unroll + #pragma unroll + #endif + for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]); +} + KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) { const u64 gid = get_global_id (0); @@ -443,28 +536,19 @@ KERNEL_FQ void m22700_loop (KERN_ATTR_TMPS (scrypt_tmp_t)) uint4 X[STATE_CNT4]; uint4 T[STATE_CNT4]; - #ifdef _unroll - #pragma unroll - #endif - for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[z]); - - scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); + const u32 P_offset = salt_repeat * STATE_CNT4; #ifdef _unroll #pragma unroll #endif - for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[z] = hc_swap32_4 (X[z]); + for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[P_offset + z]); - #if SCRYPT_P >= 1 - for (int i = STATE_CNT4; i < SCRYPT_CNT4; i += STATE_CNT4) - { - for (int z = 0; z < STATE_CNT4; z++) X[z] = hc_swap32_4 (tmps[gid].P[i + z]); + scrypt_smix_loop (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); - scrypt_smix (X, T, d_scrypt0_buf, d_scrypt1_buf, d_scrypt2_buf, d_scrypt3_buf); - - for (int z = 0; z < STATE_CNT4; z++) tmps[gid].P[i + z] = hc_swap32_4 (X[z]); - } + #ifdef _unroll + #pragma unroll #endif + for (int z = 0; z < 
STATE_CNT4; z++) tmps[gid].P[P_offset + z] = hc_swap32_4 (X[z]);
 }
 
 KERNEL_FQ void m22700_comp (KERN_ATTR_TMPS (scrypt_tmp_t))
diff --git a/docs/changes.txt b/docs/changes.txt
index f0adef031..f58707fd5 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -62,6 +62,7 @@
 - OpenCL Runtime: Workaround JiT compiler deadlock on NVIDIA driver >= 465.89
 - RAR3 Kernels: Improved loop code, improving performance by 23%
 - Startup time: Improved the startup time by avoiding some time intensive operations for skipped devices
+- Scrypt Kernels: Reduced kernel wait times by making it a true split kernel where iteration count = N value
 
 ##
 ## Technical
diff --git a/hashcat.hctune b/hashcat.hctune
index ee3446be1..4f14768c2 100644
--- a/hashcat.hctune
+++ b/hashcat.hctune
@@ -369,14 +369,14 @@ GeForce_GTX_TITAN 3 9900 2 A
 ## SCRYPT
 ##
 
-DEVICE_TYPE_CPU * 8900 1 N 1
-DEVICE_TYPE_GPU * 8900 1 N 1
-DEVICE_TYPE_CPU * 9300 1 N 1
-DEVICE_TYPE_GPU * 9300 1 N 1
-DEVICE_TYPE_CPU * 15700 1 N 1
-DEVICE_TYPE_GPU * 15700 1 1 1
-DEVICE_TYPE_CPU * 22700 1 N 1
-DEVICE_TYPE_GPU * 22700 1 N 1
+DEVICE_TYPE_CPU * 8900 1 N A
+DEVICE_TYPE_GPU * 8900 1 N A
+DEVICE_TYPE_CPU * 9300 1 N A
+DEVICE_TYPE_GPU * 9300 1 N A
+DEVICE_TYPE_CPU * 15700 1 N A
+DEVICE_TYPE_GPU * 15700 1 1 A
+DEVICE_TYPE_CPU * 22700 1 N A
+DEVICE_TYPE_GPU * 22700 1 N A
 
 ## Here's an example of how to manually tune SCRYPT algorithm kernels for your hardware.
 ## Manually tuning the GPU will yield increased performance. There is typically no noticeable change to CPU performance.
@@ -466,12 +466,12 @@ DEVICE_TYPE_GPU * 22700 1 N
 ## Find the ideal -n value, then store it here along with the proper compute device name.
 ## Formatting guidelines are availabe at the top of this document.
 
-GeForce_GTX_980 * 8900 1 28 1
-GeForce_GTX_980 * 9300 1 128 1
-GeForce_GTX_980 * 15700 1 1 1
-GeForce_GTX_980 * 22700 1 28 1
+GeForce_GTX_980 * 8900 1 28 A
+GeForce_GTX_980 * 9300 1 128 A
+GeForce_GTX_980 * 15700 1 1 A
+GeForce_GTX_980 * 22700 1 28 A
 
-GeForce_RTX_2080_Ti * 8900 1 N 1
-GeForce_RTX_2080_Ti * 9300 1 544 1
-GeForce_RTX_2080_Ti * 15700 1 4 1
-GeForce_RTX_2080_Ti * 22700 1 N 1
+GeForce_RTX_2080_Ti * 8900 1 N A
+GeForce_RTX_2080_Ti * 9300 1 544 A
+GeForce_RTX_2080_Ti * 15700 1 4 A
+GeForce_RTX_2080_Ti * 22700 1 N A
diff --git a/include/types.h b/include/types.h
index e3a31a643..29215f1f8 100644
--- a/include/types.h
+++ b/include/types.h
@@ -257,12 +257,14 @@ typedef enum kern_run
 {
   KERN_RUN_1      = 1000,
   KERN_RUN_12     = 1500,
+  KERN_RUN_2P     = 1999,
   KERN_RUN_2      = 2000,
   KERN_RUN_2E     = 2001,
   KERN_RUN_23     = 2500,
   KERN_RUN_3      = 3000,
   KERN_RUN_4      = 4000,
   KERN_RUN_INIT2  = 5000,
+  KERN_RUN_LOOP2P = 5999,
   KERN_RUN_LOOP2  = 6000,
   KERN_RUN_AUX1   = 7001,
   KERN_RUN_AUX2   = 7002,
@@ -412,30 +414,33 @@ typedef enum opts_type
   OPTS_TYPE_ST_BASE64         = (1ULL << 26),
   OPTS_TYPE_HASH_COPY         = (1ULL << 28),
   OPTS_TYPE_HASH_SPLIT        = (1ULL << 29),
-  OPTS_TYPE_LOOP_EXTENDED     = (1ULL << 30), // a kernel which is called each time normal _loop kernel finished.
+  OPTS_TYPE_LOOP_PREPARE      = (1ULL << 30), // a kernel which is called each time before _loop kernel started.
+                                              // like a hook12 kernel but without extra buffers.
+  OPTS_TYPE_LOOP_EXTENDED     = (1ULL << 31), // a kernel which is called each time normal _loop kernel finished.
                                              // but unlike a hook kernel this kernel is called for every _loop iteration offset
-  OPTS_TYPE_HOOK12            = (1ULL << 31),
-  OPTS_TYPE_HOOK23            = (1ULL << 32),
-  OPTS_TYPE_INIT2             = (1ULL << 33),
-  OPTS_TYPE_LOOP2             = (1ULL << 34),
-  OPTS_TYPE_AUX1              = (1ULL << 35),
-  OPTS_TYPE_AUX2              = (1ULL << 36),
-  OPTS_TYPE_AUX3              = (1ULL << 37),
-  OPTS_TYPE_AUX4              = (1ULL << 38),
-  OPTS_TYPE_BINARY_HASHFILE   = (1ULL << 39),
+  OPTS_TYPE_HOOK12            = (1ULL << 32),
+  OPTS_TYPE_HOOK23            = (1ULL << 33),
+  OPTS_TYPE_INIT2             = (1ULL << 34),
+  OPTS_TYPE_LOOP2_PREPARE     = (1ULL << 35), // same as OPTS_TYPE_LOOP_PREPARE but for loop2 kernel
+  OPTS_TYPE_LOOP2             = (1ULL << 36),
+  OPTS_TYPE_AUX1              = (1ULL << 37),
+  OPTS_TYPE_AUX2              = (1ULL << 38),
+  OPTS_TYPE_AUX3              = (1ULL << 39),
+  OPTS_TYPE_AUX4              = (1ULL << 40),
+  OPTS_TYPE_BINARY_HASHFILE   = (1ULL << 41),
   OPTS_TYPE_BINARY_HASHFILE_OPTIONAL
-                              = (1ULL << 40), // this allows us to not enforce the use of a binary file. requires OPTS_TYPE_BINARY_HASHFILE set to be effective.
-  OPTS_TYPE_PT_ADD06          = (1ULL << 41),
-  OPTS_TYPE_KEYBOARD_MAPPING  = (1ULL << 42),
-  OPTS_TYPE_DEEP_COMP_KERNEL  = (1ULL << 43), // if we have to iterate through each hash inside the comp kernel, for example if each hash has to be decrypted separately
-  OPTS_TYPE_TM_KERNEL         = (1ULL << 44),
-  OPTS_TYPE_SUGGEST_KG        = (1ULL << 45), // suggest keep guessing for modules the user maybe wants to use --keep-guessing
-  OPTS_TYPE_COPY_TMPS         = (1ULL << 46), // if we want to use data from tmps buffer (for example get the PMK in WPA)
-  OPTS_TYPE_POTFILE_NOPASS    = (1ULL << 47), // sometimes the password should not be printed to potfile
-  OPTS_TYPE_DYNAMIC_SHARED    = (1ULL << 48), // use dynamic shared memory (note: needs special kernel changes)
-  OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 49), // some algos use JiT in combinations with a salt or create too much startup time
-  OPTS_TYPE_MP_MULTI_DISABLE  = (1ULL << 50), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
-  OPTS_TYPE_NATIVE_THREADS    = (1ULL << 51), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
+                              = (1ULL << 42), // this allows us to not enforce the use of a binary file. requires OPTS_TYPE_BINARY_HASHFILE set to be effective.
+ OPTS_TYPE_PT_ADD06 = (1ULL << 43), + OPTS_TYPE_KEYBOARD_MAPPING = (1ULL << 44), + OPTS_TYPE_DEEP_COMP_KERNEL = (1ULL << 45), // if we have to iterate through each hash inside the comp kernel, for example if each hash has to be decrypted separately + OPTS_TYPE_TM_KERNEL = (1ULL << 46), + OPTS_TYPE_SUGGEST_KG = (1ULL << 47), // suggest keep guessing for modules the user maybe wants to use --keep-guessing + OPTS_TYPE_COPY_TMPS = (1ULL << 48), // if we want to use data from tmps buffer (for example get the PMK in WPA) + OPTS_TYPE_POTFILE_NOPASS = (1ULL << 49), // sometimes the password should not be printed to potfile + OPTS_TYPE_DYNAMIC_SHARED = (1ULL << 50), // use dynamic shared memory (note: needs special kernel changes) + OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 51), // some algos use JiT in combinations with a salt or create too much startup time + OPTS_TYPE_MP_MULTI_DISABLE = (1ULL << 52), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings + OPTS_TYPE_NATIVE_THREADS = (1ULL << 53), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps) } opts_type_t; @@ -1094,12 +1099,14 @@ typedef struct hc_device_param u32 kernel_wgs1; u32 kernel_wgs12; + u32 kernel_wgs2p; u32 kernel_wgs2; u32 kernel_wgs2e; u32 kernel_wgs23; u32 kernel_wgs3; u32 kernel_wgs4; u32 kernel_wgs_init2; + u32 kernel_wgs_loop2p; u32 kernel_wgs_loop2; u32 kernel_wgs_mp; u32 kernel_wgs_mp_l; @@ -1116,12 +1123,14 @@ typedef struct hc_device_param u32 kernel_preferred_wgs_multiple1; u32 kernel_preferred_wgs_multiple12; + u32 kernel_preferred_wgs_multiple2p; u32 kernel_preferred_wgs_multiple2; u32 kernel_preferred_wgs_multiple2e; u32 kernel_preferred_wgs_multiple23; u32 kernel_preferred_wgs_multiple3; u32 kernel_preferred_wgs_multiple4; u32 kernel_preferred_wgs_multiple_init2; + u32 kernel_preferred_wgs_multiple_loop2p; u32 kernel_preferred_wgs_multiple_loop2; u32 kernel_preferred_wgs_multiple_mp; u32 kernel_preferred_wgs_multiple_mp_l; @@ -1138,12 +1147,14 @@ typedef struct hc_device_param u64 kernel_local_mem_size1; u64 kernel_local_mem_size12; + u64 kernel_local_mem_size2p; u64 kernel_local_mem_size2; u64 kernel_local_mem_size2e; u64 kernel_local_mem_size23; u64 kernel_local_mem_size3; u64 kernel_local_mem_size4; u64 kernel_local_mem_size_init2; + u64 kernel_local_mem_size_loop2p; u64 kernel_local_mem_size_loop2; u64 kernel_local_mem_size_mp; u64 kernel_local_mem_size_mp_l; @@ -1160,12 +1171,14 @@ typedef struct hc_device_param u64 kernel_dynamic_local_mem_size1; u64 kernel_dynamic_local_mem_size12; + u64 kernel_dynamic_local_mem_size2p; u64 kernel_dynamic_local_mem_size2; u64 kernel_dynamic_local_mem_size2e; u64 kernel_dynamic_local_mem_size23; u64 kernel_dynamic_local_mem_size3; u64 kernel_dynamic_local_mem_size4; u64 kernel_dynamic_local_mem_size_init2; + u64 kernel_dynamic_local_mem_size_loop2p; u64 kernel_dynamic_local_mem_size_loop2; u64 kernel_dynamic_local_mem_size_mp; u64 kernel_dynamic_local_mem_size_mp_l; @@ -1273,11 +1286,13 @@ typedef struct hc_device_param // workaround cpu spinning double exec_us_prev1[EXPECTED_ITERATIONS]; + double exec_us_prev2p[EXPECTED_ITERATIONS]; double exec_us_prev2[EXPECTED_ITERATIONS]; double exec_us_prev2e[EXPECTED_ITERATIONS]; double exec_us_prev3[EXPECTED_ITERATIONS]; double exec_us_prev4[EXPECTED_ITERATIONS]; double exec_us_prev_init2[EXPECTED_ITERATIONS]; + double exec_us_prev_loop2p[EXPECTED_ITERATIONS]; double exec_us_prev_loop2[EXPECTED_ITERATIONS]; double 
exec_us_prev_aux1[EXPECTED_ITERATIONS]; double exec_us_prev_aux2[EXPECTED_ITERATIONS]; @@ -1378,12 +1393,14 @@ typedef struct hc_device_param CUfunction cuda_function1; CUfunction cuda_function12; + CUfunction cuda_function2p; CUfunction cuda_function2; CUfunction cuda_function2e; CUfunction cuda_function23; CUfunction cuda_function3; CUfunction cuda_function4; CUfunction cuda_function_init2; + CUfunction cuda_function_loop2p; CUfunction cuda_function_loop2; CUfunction cuda_function_mp; CUfunction cuda_function_mp_l; @@ -1462,12 +1479,14 @@ typedef struct hc_device_param cl_kernel opencl_kernel1; cl_kernel opencl_kernel12; + cl_kernel opencl_kernel2p; cl_kernel opencl_kernel2; cl_kernel opencl_kernel2e; cl_kernel opencl_kernel23; cl_kernel opencl_kernel3; cl_kernel opencl_kernel4; cl_kernel opencl_kernel_init2; + cl_kernel opencl_kernel_loop2p; cl_kernel opencl_kernel_loop2; cl_kernel opencl_kernel_mp; cl_kernel opencl_kernel_mp_l; diff --git a/src/backend.c b/src/backend.c index 583d0712d..79c4944c9 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2998,11 +2998,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, } else { - bool run_init = true; - bool run_loop = true; - bool run_comp = true; - - if (run_init == true) + if (true) { if (device_param->is_cuda == true) { @@ -3089,165 +3085,190 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, } } - if (run_loop == true) + if (true) { - u32 iter = hashes->salts_buf[salt_pos].salt_iter; + const u32 salt_repeats = hashes->salts_buf[salt_pos].salt_repeats; - u32 loop_step = device_param->kernel_loops; - - for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++) + for (u32 salt_repeat = 0; salt_repeat <= salt_repeats; salt_repeat++) { - u32 loop_left = iter - loop_pos; + device_param->kernel_params_buf32[34] = salt_repeat; - loop_left = MIN (loop_left, loop_step); - - device_param->kernel_params_buf32[28] = loop_pos; - device_param->kernel_params_buf32[29] = loop_left; - - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1; - - if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED) + if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE) { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1; + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, pws_pos, pws_cnt, false, 0) == -1) return -1; } - //bug? 
- //while (status_ctx->run_thread_level2 == false) break; - if (status_ctx->run_thread_level2 == false) break; - - /** - * speed - */ - - const float iter_part = (float) (loop_pos + loop_left) / iter; - - const u64 perf_sum_all = (u64) (pws_cnt * iter_part); - - double speed_msec = hc_timer_get (device_param->timer_speed); - - const u32 speed_pos = device_param->speed_pos; - - device_param->speed_cnt[speed_pos] = perf_sum_all; - - device_param->speed_msec[speed_pos] = speed_msec; - - if (user_options->speed_only == true) + if (true) { - if (speed_msec > 4000) + const u32 iter = hashes->salts_buf[salt_pos].salt_iter; + + const u32 loop_step = device_param->kernel_loops; + + for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++) { - device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left); + u32 loop_left = iter - loop_pos; - device_param->speed_pos = 1; + loop_left = MIN (loop_left, loop_step); - device_param->speed_only_finish = true; + device_param->kernel_params_buf32[28] = loop_pos; + device_param->kernel_params_buf32[29] = loop_left; - return 0; + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1; + + if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED) + { + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_2E, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1; + } + + //bug? + //while (status_ctx->run_thread_level2 == false) break; + if (status_ctx->run_thread_level2 == false) break; + + /** + * speed + */ + + const float iter_part = (float) (loop_pos + loop_left) / iter; + + const u64 perf_sum_all = (u64) (pws_cnt * iter_part); + + double speed_msec = hc_timer_get (device_param->timer_speed); + + const u32 speed_pos = device_param->speed_pos; + + device_param->speed_cnt[speed_pos] = perf_sum_all; + + device_param->speed_msec[speed_pos] = speed_msec; + + if (user_options->speed_only == true) + { + if (speed_msec > 4000) + { + device_param->outerloop_multi *= (double) iter / (double) (loop_pos + loop_left); + + device_param->speed_pos = 1; + + device_param->speed_only_finish = true; + + return 0; + } + } + } + + if (hashconfig->opts_type & OPTS_TYPE_HOOK23) + { + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_pos, pws_cnt, false, 0) == -1) return -1; + + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; + } + + if (device_param->is_opencl == true) + { + if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; + } + + const int hook_threads = (int) user_options->hook_threads; + + hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t)); + + for (int i = 0; i < hook_threads; i++) + { + hook_thread_param_t *hook_thread_param = hook_threads_param + i; + + hook_thread_param->tid = i; + hook_thread_param->tsz = hook_threads; + + hook_thread_param->module_ctx = module_ctx; + hook_thread_param->status_ctx = status_ctx; + + hook_thread_param->device_param = device_param; + + hook_thread_param->hook_extra_param = module_ctx->hook_extra_params[i]; + hook_thread_param->hook_salts_buf = hashes->hook_salts_buf; + + hook_thread_param->salt_pos = salt_pos; + + hook_thread_param->pws_cnt = pws_cnt; + } + + 
hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t)); + + for (int i = 0; i < hook_threads; i++) + { + hook_thread_param_t *hook_thread_param = hook_threads_param + i; + + hc_thread_create (c_threads[i], hook23_thread, hook_thread_param); + } + + hc_thread_wait (hook_threads, c_threads); + + hcfree (c_threads); + + hcfree (hook_threads_param); + + if (device_param->is_cuda == true) + { + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; + } + + if (device_param->is_opencl == true) + { + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; + } } } } - - if (hashconfig->opts_type & OPTS_TYPE_HOOK23) - { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_23, pws_pos, pws_cnt, false, 0) == -1) return -1; - - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; - } - - if (device_param->is_opencl == true) - { - if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; - } - - const int hook_threads = (int) user_options->hook_threads; - - hook_thread_param_t *hook_threads_param = (hook_thread_param_t *) hccalloc (hook_threads, sizeof (hook_thread_param_t)); - - for (int i = 0; i < hook_threads; i++) - { - hook_thread_param_t *hook_thread_param = hook_threads_param + i; - - hook_thread_param->tid = i; - hook_thread_param->tsz = hook_threads; - - hook_thread_param->module_ctx = module_ctx; - hook_thread_param->status_ctx = status_ctx; - - hook_thread_param->device_param = device_param; - - hook_thread_param->hook_extra_param = module_ctx->hook_extra_params[i]; - hook_thread_param->hook_salts_buf = hashes->hook_salts_buf; - - hook_thread_param->salt_pos = salt_pos; - - hook_thread_param->pws_cnt = pws_cnt; - } - - hc_thread_t *c_threads = (hc_thread_t *) hccalloc (hook_threads, sizeof (hc_thread_t)); - - for (int i = 0; i < hook_threads; i++) - { - hook_thread_param_t *hook_thread_param = hook_threads_param + i; - - hc_thread_create (c_threads[i], hook23_thread, hook_thread_param); - } - - hc_thread_wait (hook_threads, c_threads); - - hcfree (c_threads); - - hcfree (hook_threads_param); - - if (device_param->is_cuda == true) - { - if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; - } - - if (device_param->is_opencl == true) - { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, pws_cnt * hashconfig->hook_size, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; - } - } } - // init2 and loop2 are kind of special, we use run_loop for them, too + // note: they also do not influence the performance screen + // in case you want to use this, this cane make sense only if your input data comes out of tmps[] - if (run_loop == true) + if (hashconfig->opts_type & OPTS_TYPE_INIT2) { - // note: they also do not influence the performance screen - // in case you want to use this, this cane make sense only if your input data comes out of tmps[] + if (run_kernel (hashcat_ctx, device_param, 
KERN_RUN_INIT2, pws_pos, pws_cnt, false, 0) == -1) return -1; + } - if (hashconfig->opts_type & OPTS_TYPE_INIT2) + if (true) + { + const u32 salt_repeats = hashes->salts_buf[salt_pos].salt_repeats; + + for (u32 salt_repeat = 0; salt_repeat <= salt_repeats; salt_repeat++) { - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_INIT2, pws_pos, pws_cnt, false, 0) == -1) return -1; - } + device_param->kernel_params_buf32[34] = salt_repeat; - if (hashconfig->opts_type & OPTS_TYPE_LOOP2) - { - u32 iter = hashes->salts_buf[salt_pos].salt_iter2; - - u32 loop_step = device_param->kernel_loops; - - for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++) + if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE) { - u32 loop_left = iter - loop_pos; + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2P, pws_pos, pws_cnt, false, 0) == -1) return -1; + } - loop_left = MIN (loop_left, loop_step); + if (hashconfig->opts_type & OPTS_TYPE_LOOP2) + { + u32 iter = hashes->salts_buf[salt_pos].salt_iter2; - device_param->kernel_params_buf32[28] = loop_pos; - device_param->kernel_params_buf32[29] = loop_left; + u32 loop_step = device_param->kernel_loops; - if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1; + for (u32 loop_pos = 0, slow_iteration = 0; loop_pos < iter; loop_pos += loop_step, slow_iteration++) + { + u32 loop_left = iter - loop_pos; - //bug? - //while (status_ctx->run_thread_level2 == false) break; - if (status_ctx->run_thread_level2 == false) break; + loop_left = MIN (loop_left, loop_step); + + device_param->kernel_params_buf32[28] = loop_pos; + device_param->kernel_params_buf32[29] = loop_left; + + if (run_kernel (hashcat_ctx, device_param, KERN_RUN_LOOP2, pws_pos, pws_cnt, true, slow_iteration) == -1) return -1; + + //bug? 
+ //while (status_ctx->run_thread_level2 == false) break; + if (status_ctx->run_thread_level2 == false) break; + } } } } - if (run_comp == true) + if (true) { if (hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL) { @@ -3525,6 +3546,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con kernel_threads = device_param->kernel_wgs12; dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size12; break; + case KERN_RUN_2P: + kernel_threads = device_param->kernel_wgs2p; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2p; + break; case KERN_RUN_2: kernel_threads = device_param->kernel_wgs2; dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size2; @@ -3549,6 +3574,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con kernel_threads = device_param->kernel_wgs_init2; dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_init2; break; + case KERN_RUN_LOOP2P: + kernel_threads = device_param->kernel_wgs_loop2p; + dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2p; + break; case KERN_RUN_LOOP2: kernel_threads = device_param->kernel_wgs_loop2; dynamic_shared_mem = device_param->kernel_dynamic_local_mem_size_loop2; @@ -3590,8 +3619,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con kernel_threads = MIN (kernel_threads, device_param->kernel_threads); - device_param->kernel_params_buf64[34] = pws_pos; - device_param->kernel_params_buf64[35] = num; + device_param->kernel_params_buf64[35] = pws_pos; + device_param->kernel_params_buf64[36] = num; u64 num_elements = num; @@ -3603,19 +3632,21 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con { switch (kern_run) { - case KERN_RUN_1: cuda_function = device_param->cuda_function1; break; - case KERN_RUN_12: cuda_function = device_param->cuda_function12; break; - case KERN_RUN_2: cuda_function = device_param->cuda_function2; break; - case KERN_RUN_2E: cuda_function = device_param->cuda_function2e; break; - case KERN_RUN_23: cuda_function = device_param->cuda_function23; break; - case KERN_RUN_3: cuda_function = device_param->cuda_function3; break; - case KERN_RUN_4: cuda_function = device_param->cuda_function4; break; - case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break; - case KERN_RUN_LOOP2: cuda_function = device_param->cuda_function_loop2; break; - case KERN_RUN_AUX1: cuda_function = device_param->cuda_function_aux1; break; - case KERN_RUN_AUX2: cuda_function = device_param->cuda_function_aux2; break; - case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break; - case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break; + case KERN_RUN_1: cuda_function = device_param->cuda_function1; break; + case KERN_RUN_12: cuda_function = device_param->cuda_function12; break; + case KERN_RUN_2P: cuda_function = device_param->cuda_function2p; break; + case KERN_RUN_2: cuda_function = device_param->cuda_function2; break; + case KERN_RUN_2E: cuda_function = device_param->cuda_function2e; break; + case KERN_RUN_23: cuda_function = device_param->cuda_function23; break; + case KERN_RUN_3: cuda_function = device_param->cuda_function3; break; + case KERN_RUN_4: cuda_function = device_param->cuda_function4; break; + case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break; + case KERN_RUN_LOOP2P: cuda_function = device_param->cuda_function_loop2p; break; + case KERN_RUN_LOOP2: cuda_function = 
device_param->cuda_function_loop2; break; + case KERN_RUN_AUX1: cuda_function = device_param->cuda_function_aux1; break; + case KERN_RUN_AUX2: cuda_function = device_param->cuda_function_aux2; break; + case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break; + case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break; } if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1; @@ -3700,19 +3731,21 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con { switch (kern_run) { - case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break; - case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break; - case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break; - case KERN_RUN_2E: opencl_kernel = device_param->opencl_kernel2e; break; - case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break; - case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break; - case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break; - case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break; - case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break; - case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break; - case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break; - case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break; - case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break; + case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break; + case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break; + case KERN_RUN_2P: opencl_kernel = device_param->opencl_kernel2p; break; + case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break; + case KERN_RUN_2E: opencl_kernel = device_param->opencl_kernel2e; break; + case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break; + case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break; + case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break; + case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break; + case KERN_RUN_LOOP2P: opencl_kernel = device_param->opencl_kernel_loop2p; break; + case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break; + case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break; + case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break; + case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break; + case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break; } } @@ -3721,12 +3754,12 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]) == -1) return -1; } - for (u32 i = 24; i <= 33; i++) + for (u32 i = 24; i <= 34; i++) { if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]) == -1) return -1; } - for (u32 i = 34; i <= 35; i++) + for (u32 i = 35; i <= 36; i++) { if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]) == -1) return -1; } @@ -3786,17 +3819,19 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con { switch (kern_run) { - case KERN_RUN_1: if 
(device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_2E: if (device_param->exec_us_prev2e[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2e[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_2P: if (device_param->exec_us_prev2p[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2p[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_2E: if (device_param->exec_us_prev2e[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2e[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_LOOP2P: if (device_param->exec_us_prev_loop2p[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2p[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) 
(device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break; } } else @@ -3830,17 +3865,19 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con { switch (kern_run) { - case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break; - case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break; - case KERN_RUN_2E: device_param->exec_us_prev2e[iterationm] = exec_us; break; - case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break; - case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break; - case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break; - case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break; - case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break; - case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break; - case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break; - case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break; + case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break; + case KERN_RUN_2P: device_param->exec_us_prev2p[iterationm] = exec_us; break; + case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break; + case KERN_RUN_2E: device_param->exec_us_prev2e[iterationm] = exec_us; break; + case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break; + case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break; + case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break; + case KERN_RUN_LOOP2P: device_param->exec_us_prev_loop2p[iterationm] = exec_us; break; + case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break; + case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break; + case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break; + case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break; + case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break; } } } @@ -9086,8 +9123,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_buf32[31] = 0; // digests_cnt device_param->kernel_params_buf32[32] = 0; // digests_offset device_param->kernel_params_buf32[33] = 0; // combs_mode - device_param->kernel_params_buf64[34] = 0; // pws_pos - device_param->kernel_params_buf64[35] = 0; // gid_max + device_param->kernel_params_buf32[34] = 0; // salt_repeat + device_param->kernel_params_buf64[35] = 0; // pws_pos + device_param->kernel_params_buf64[36] = 0; // gid_max if (device_param->is_cuda == true) { @@ -9155,8 +9193,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params[31] = &device_param->kernel_params_buf32[31]; device_param->kernel_params[32] = &device_param->kernel_params_buf32[32]; device_param->kernel_params[33] = 
&device_param->kernel_params_buf32[33];
-    device_param->kernel_params[34] = &device_param->kernel_params_buf64[34];
+    device_param->kernel_params[34] = &device_param->kernel_params_buf32[34];
     device_param->kernel_params[35] = &device_param->kernel_params_buf64[35];
+    device_param->kernel_params[36] = &device_param->kernel_params_buf64[36];
 
     if (user_options->slow_candidates == true)
     {
@@ -9554,6 +9593,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
         device_param->kernel_preferred_wgs_multiple3 = device_param->cuda_warp_size;
 
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
+        {
+          // kernel2p
+
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_prepare", kern_type);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function2p, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_wgs2p) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_local_mem_size2p) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function2p, &device_param->kernel_dynamic_local_mem_size2p) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple2p = device_param->cuda_warp_size;
+        }
+
         if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
         {
           // kernel2e
@@ -9622,6 +9678,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
           device_param->kernel_preferred_wgs_multiple_init2 = device_param->cuda_warp_size;
         }
 
+        // loop2 prepare
+
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2_prepare", kern_type);
+
+          if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_loop2p, device_param->cuda_module, kernel_name) == -1) return -1;
+
+          if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_wgs_loop2p) == -1) return -1;
+
+          if (get_cuda_kernel_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_local_mem_size_loop2p) == -1) return -1;
+
+          if (get_cuda_kernel_dynamic_local_mem_size (hashcat_ctx, device_param->cuda_function_loop2p, &device_param->kernel_dynamic_local_mem_size_loop2p) == -1) return -1;
+
+          device_param->kernel_preferred_wgs_multiple_loop2p = device_param->cuda_warp_size;
+        }
+
         // loop2
 
         if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
@@ -10142,6 +10215,21 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
         // aux1
 
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_prepare", kern_type);
+
+          if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel2p) == -1) return -1;
+
+          if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_wgs2p) == -1) return -1;
+
+          if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_local_mem_size2p) == -1) return -1;
+
+          if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_dynamic_local_mem_size2p) == -1) return -1;
+
+          if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel2p, &device_param->kernel_preferred_wgs_multiple2p) == -1) return -1;
+        }
+
         if (hashconfig->opts_type & OPTS_TYPE_LOOP_EXTENDED)
         {
           snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop_extended", kern_type);
@@ -10208,6 +10296,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
           if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_init2, &device_param->kernel_preferred_wgs_multiple_init2) == -1) return -1;
         }
 
+        // loop2 prepare
+
+        if (hashconfig->opts_type & OPTS_TYPE_LOOP2_PREPARE)
+        {
+          snprintf (kernel_name, sizeof (kernel_name), "m%05u_loop2_prepare", kern_type);
+
+          if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program, kernel_name, &device_param->opencl_kernel_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_wgs_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_local_mem_size_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_dynamic_local_mem_size_loop2p) == -1) return -1;
+
+          if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_loop2p, &device_param->kernel_preferred_wgs_multiple_loop2p) == -1) return -1;
+        }
+
         // loop2
 
         if (hashconfig->opts_type & OPTS_TYPE_LOOP2)
@@ -11071,12 +11176,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
 
     device_param->cuda_function1 = NULL;
     device_param->cuda_function12 = NULL;
+    device_param->cuda_function2p = NULL;
     device_param->cuda_function2 = NULL;
     device_param->cuda_function2e = NULL;
     device_param->cuda_function23 = NULL;
     device_param->cuda_function3 = NULL;
     device_param->cuda_function4 = NULL;
     device_param->cuda_function_init2 = NULL;
+    device_param->cuda_function_loop2p = NULL;
     device_param->cuda_function_loop2 = NULL;
     device_param->cuda_function_mp = NULL;
     device_param->cuda_function_mp_l = NULL;
@@ -11139,12 +11246,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
 
     if (device_param->opencl_kernel1) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel1);
     if (device_param->opencl_kernel12) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel12);
+    if (device_param->opencl_kernel2p) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2p);
     if (device_param->opencl_kernel2) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2);
     if (device_param->opencl_kernel2e) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel2e);
     if (device_param->opencl_kernel23) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel23);
     if (device_param->opencl_kernel3) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel3);
     if (device_param->opencl_kernel4) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel4);
     if (device_param->opencl_kernel_init2) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_init2);
+    if (device_param->opencl_kernel_loop2p) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_loop2p);
     if (device_param->opencl_kernel_loop2) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_loop2);
     if (device_param->opencl_kernel_mp) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp);
     if (device_param->opencl_kernel_mp_l) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_mp_l);
@@ -11205,12 +11314,14 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
     device_param->opencl_d_st_esalts_buf = NULL;
     device_param->opencl_kernel1 = NULL;
     device_param->opencl_kernel12 = NULL;
+    device_param->opencl_kernel2p = NULL;
     device_param->opencl_kernel2 = NULL;
     device_param->opencl_kernel2e = NULL;
     device_param->opencl_kernel23 = NULL;
     device_param->opencl_kernel3 = NULL;
     device_param->opencl_kernel4 = NULL;
     device_param->opencl_kernel_init2 = NULL;
+    device_param->opencl_kernel_loop2p = NULL;
     device_param->opencl_kernel_loop2 = NULL;
     device_param->opencl_kernel_mp = NULL;
     device_param->opencl_kernel_mp_l = NULL;
diff --git a/src/modules/module_02500.c b/src/modules/module_02500.c
index b46ffc4f1..aede25a2f 100644
--- a/src/modules/module_02500.c
+++ b/src/modules/module_02500.c
@@ -579,6 +579,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1, // digests_cnt
     0, // digests_offset
     0, // combs_mode
+    0, // salt_repeat
     0, // pws_pos
     1  // gid_max
   );
diff --git a/src/modules/module_02501.c b/src/modules/module_02501.c
index 80d41ebba..e29a64065 100644
--- a/src/modules/module_02501.c
+++ b/src/modules/module_02501.c
@@ -554,6 +554,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1, // digests_cnt
     0, // digests_offset
     0, // combs_mode
+    0, // salt_repeat
     0, // pws_pos
     1  // gid_max
   );
diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c
index 766a11213..5b40e989a 100644
--- a/src/modules/module_03200.c
+++ b/src/modules/module_03200.c
@@ -21,6 +21,7 @@ static const char *HASH_NAME = "bcrypt $2*$, Blowfish (Unix)";
 static const u64 KERN_TYPE = 3200;
 static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
+                           | OPTS_TYPE_MP_MULTI_DISABLE
                            | OPTS_TYPE_DYNAMIC_SHARED;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS = "hashcat";
diff --git a/src/modules/module_08900.c b/src/modules/module_08900.c
index 277b90330..6343543da 100644
--- a/src/modules/module_08900.c
+++ b/src/modules/module_08900.c
@@ -24,6 +24,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
                            | OPTS_TYPE_MP_MULTI_DISABLE
                            | OPTS_TYPE_NATIVE_THREADS
+                           | OPTS_TYPE_LOOP_PREPARE
                            | OPTS_TYPE_SELF_TEST_DISABLE;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS = "hashcat";
@@ -63,14 +64,14 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -330,6 +331,11 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   salt->scrypt_r = hc_strtoul ((const char *) r_pos, NULL, 10);
   salt->scrypt_p = hc_strtoul ((const char *) p_pos, NULL, 10);
 
+  salt->salt_iter = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
+
+  if (salt->scrypt_N % 1024) return (PARSER_SALT_VALUE); // we set loop count to 1024 fixed
+
   // salt
 
   const u8 *salt_pos = token.buf[4];
@@ -341,8 +347,7 @@ int module_hash_decode (MAYBE_UNUSE
 
   memcpy (salt->salt_buf, tmp_buf, tmp_len);
 
-  salt->salt_len  = tmp_len;
-  salt->salt_iter = 1;
+  salt->salt_len = tmp_len;
 
   // digest - base64 decode
 
diff --git a/src/modules/module_09300.c b/src/modules/module_09300.c
index 73b130663..fbf5a6064 100644
--- a/src/modules/module_09300.c
+++ b/src/modules/module_09300.c
@@ -24,6 +24,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
                            | OPTS_TYPE_MP_MULTI_DISABLE
                            | OPTS_TYPE_NATIVE_THREADS
+                           | OPTS_TYPE_LOOP_PREPARE
                            | OPTS_TYPE_SELF_TEST_DISABLE;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS = "hashcat";
@@ -52,14 +53,14 @@ static const u64 SCRYPT_P = 1;
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -299,11 +300,14 @@ int module_hash_decode (MAYBE_UNUSE
   memcpy (salt_buf_ptr, salt_pos, salt_len);
 
   salt->salt_len = salt_len;
-  salt->salt_iter = 1;
 
-  salt->scrypt_N = 16384;
-  salt->scrypt_r = 1;
-  salt->scrypt_p = 1;
+  salt->scrypt_N = SCRYPT_N;
+  salt->scrypt_r = SCRYPT_R;
+  salt->scrypt_p = SCRYPT_P;
+
+  salt->salt_iter = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
+
   // base64 decode hash
 
diff --git a/src/modules/module_15700.c b/src/modules/module_15700.c
index 4b473410e..e76cc1427 100644
--- a/src/modules/module_15700.c
+++ b/src/modules/module_15700.c
@@ -24,6 +24,7 @@ static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
                            | OPTS_TYPE_MP_MULTI_DISABLE
                            | OPTS_TYPE_NATIVE_THREADS
+                           | OPTS_TYPE_LOOP_PREPARE
                            | OPTS_TYPE_SELF_TEST_DISABLE
                            | OPTS_TYPE_ST_HEX;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
@@ -60,14 +61,14 @@ static const u64 SCRYPT_P = 1;
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -349,6 +350,11 @@ int module_hash_decode (MAYBE_UNUSE
   salt->scrypt_r = scrypt_r;
   salt->scrypt_p = scrypt_p;
 
+  salt->salt_iter = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
+
+  if (salt->scrypt_N % 1024) return (PARSER_SALT_VALUE); // we set loop count to 1024 fixed
+
   // salt
 
   const u8 *salt_pos = token.buf[4];
@@ -367,8 +373,6 @@ int module_hash_decode (MAYBE_UNUSE
   ethereum_scrypt->salt_buf[6] = salt->salt_buf[6];
   ethereum_scrypt->salt_buf[7] = salt->salt_buf[7];
 
-  salt->salt_iter = 1;
-
   // ciphertext
 
   const u8 *ciphertext_pos = token.buf[5];
diff --git a/src/modules/module_16800.c b/src/modules/module_16800.c
index 15c0e343f..625d2fbf0 100644
--- a/src/modules/module_16800.c
+++ b/src/modules/module_16800.c
@@ -290,6 +290,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1, // digests_cnt
     0, // digests_offset
     0, // combs_mode
+    0, // salt_repeat
     0, // pws_pos
     1  // gid_max
   );
diff --git a/src/modules/module_16801.c b/src/modules/module_16801.c
index 3324fa005..6d237ebf5 100644
--- a/src/modules/module_16801.c
+++ b/src/modules/module_16801.c
@@ -312,6 +312,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1, // digests_cnt
     0, // digests_offset
     0, // combs_mode
+    0, // salt_repeat
     0, // pws_pos
     1  // gid_max
   );
diff --git a/src/modules/module_22000.c b/src/modules/module_22000.c
index 996f6eda5..2b5f60cfb 100644
--- a/src/modules/module_22000.c
+++ b/src/modules/module_22000.c
@@ -600,6 +600,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1, // digests_cnt
     0, // digests_offset
     0, // combs_mode
+    0, // salt_repeat
     0, // pws_pos
     1  // gid_max
   );
diff --git a/src/modules/module_22001.c b/src/modules/module_22001.c
index 5b8737c3d..dd45f3bd2 100644
--- a/src/modules/module_22001.c
+++ b/src/modules/module_22001.c
@@ -601,6 +601,7 @@ bool module_potfile_custom_check (MAYBE_UNUSED const hashconfig_t *hashconfig, M
     1, // digests_cnt
     0, // digests_offset
     0, // combs_mode
+    0, // salt_repeat
     0, // pws_pos
     1  // gid_max
   );
diff --git a/src/modules/module_22700.c b/src/modules/module_22700.c
index f866bc235..6a82768d3 100644
--- a/src/modules/module_22700.c
+++ b/src/modules/module_22700.c
@@ -25,6 +25,7 @@ static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_BE
                            | OPTS_TYPE_PT_UTF16BE
                            | OPTS_TYPE_MP_MULTI_DISABLE
                            | OPTS_TYPE_NATIVE_THREADS
+                           | OPTS_TYPE_LOOP_PREPARE
                            | OPTS_TYPE_SELF_TEST_DISABLE;
 static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS = "hashcat";
@@ -64,14 +65,14 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
 
 u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1;
+  const u32 kernel_loops_min = 1024;
 
   return kernel_loops_min;
 }
 
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1;
+  const u32 kernel_loops_max = 1024;
 
   return kernel_loops_max;
 }
@@ -320,6 +321,9 @@ int module_hash_decode (MAYBE_UNUSE
   salt->scrypt_r = SCRYPT_R;
   salt->scrypt_p = SCRYPT_P;
 
+  salt->salt_iter = salt->scrypt_N;
+  salt->salt_repeats = salt->scrypt_p - 1;
+
   // version
 
   const u8 *version_pos = token.buf[1];
@@ -353,8 +357,7 @@ int module_hash_decode (MAYBE_UNUSE
   salt->salt_buf[10] = hex_to_u32 (b2_pos + 16);
   salt->salt_buf[11] = hex_to_u32 (b2_pos + 24);
 
-  salt->salt_len  = 48;
-  salt->salt_iter = 1;
+  salt->salt_len = 48;
 
   // fake digest: