From e0e796bc2d3a486d31e6514ffa2825d4b94d43cf Mon Sep 17 00:00:00 2001
From: Jens Steube
Date: Thu, 15 Feb 2018 15:35:22 +0100
Subject: [PATCH] More optimized -m 500, -m 1600 and -m 6300 pure kernel

---
Review note: the word-wise digest store replaces a loop that copied exactly
16 bytes to the end of wpc[pc]. For lengths that are not a multiple of four
the digest straddles five words, so the first and the last word are merged
with their existing content (masked) instead of being overwritten; the bytes
that follow the digest therefore stay untouched, as with the removed loop.

 OpenCL/m00500.cl | 74 +++++++++++++++++++++++++-----------------------
 OpenCL/m01600.cl | 74 +++++++++++++++++++++++++-----------------------
 OpenCL/m06300.cl | 74 +++++++++++++++++++++++++-----------------------
 3 files changed, 117 insertions(+), 105 deletions(-)

diff --git a/OpenCL/m00500.cl b/OpenCL/m00500.cl
index d657efa3c..3657a140a 100644
--- a/OpenCL/m00500.cl
+++ b/OpenCL/m00500.cl
@@ -15,35 +15,11 @@
 #define COMPARE_S "inc_comp_single.cl"
 #define COMPARE_M "inc_comp_multi.cl"
 
+#define PUTCHAR_LE(a,p,c) ((u8 *)(a))[(p)] = (u8) (c)
+#define GETCHAR_LE(a,p) ((u8 *)(a))[(p)]
+
 #define md5crypt_magic 0x00243124u
 
-DECLSPEC void md5_transform_transport (const u32 *w, u32 *digest)
-{
-  u32 t0[4];
-  u32 t1[4];
-  u32 t2[4];
-  u32 t3[4];
-
-  t0[0] = w[ 0];
-  t0[1] = w[ 1];
-  t0[2] = w[ 2];
-  t0[3] = w[ 3];
-  t1[0] = w[ 4];
-  t1[1] = w[ 5];
-  t1[2] = w[ 6];
-  t1[3] = w[ 7];
-  t2[0] = w[ 8];
-  t2[1] = w[ 9];
-  t2[2] = w[10];
-  t2[3] = w[11];
-  t3[0] = w[12];
-  t3[1] = w[13];
-  t3[2] = w[14];
-  t3[3] = w[15];
-
-  md5_transform (t0, t1, t2, t3, digest);
-}
-
 __kernel void m00500_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global md5crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
 {
   /**
@@ -208,9 +184,6 @@ __kernel void m00500_loop (__global pw_t *pws, __global const kernel_rule_t *rul
 
   u32 wpc[8][64 + 64 + 64 + 64];
 
-  #define PUTCHAR_LE(a,p,c) ((u8 *)(a))[(p)] = (u8) (c)
-  #define GETCHAR_LE(a,p) ((u8 *)(a))[(p)]
-
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -283,12 +256,43 @@ __kernel void m00500_loop (__global pw_t *pws, __global const kernel_rule_t *rul
 
     if (j1)
     {
-      #ifdef _unroll
-      #pragma unroll
-      #endif
-      for (u32 k = 0, p = wpc_len[pc] - 16; k < 16; k++, p++)
+      const u32 off = wpc_len[pc] / 4;
+      const u32 mod = wpc_len[pc] % 4;
+
+      u32 *ptr = wpc[pc] + off - 4;
+
+      switch (mod)
       {
-        PUTCHAR_LE (wpc[pc], p, GETCHAR_LE (digest, k));
+        case 0:
+          ptr[0] = digest[0];
+          ptr[1] = digest[1];
+          ptr[2] = digest[2];
+          ptr[3] = digest[3];
+          break;
+
+        case 1:
+          ptr[0] = (ptr[0] & 0xff) | (digest[0] << 8);
+          ptr[1] = (digest[0] >> 24) | (digest[1] << 8);
+          ptr[2] = (digest[1] >> 24) | (digest[2] << 8);
+          ptr[3] = (digest[2] >> 24) | (digest[3] << 8);
+          ptr[4] = (ptr[4] & 0xffffff00) | (digest[3] >> 24);
+          break;
+
+        case 2:
+          ptr[0] = (ptr[0] & 0xffff) | (digest[0] << 16);
+          ptr[1] = (digest[0] >> 16) | (digest[1] << 16);
+          ptr[2] = (digest[1] >> 16) | (digest[2] << 16);
+          ptr[3] = (digest[2] >> 16) | (digest[3] << 16);
+          ptr[4] = (ptr[4] & 0xffff0000) | (digest[3] >> 16);
+          break;
+
+        case 3:
+          ptr[0] = (ptr[0] & 0xffffff) | (digest[0] << 24);
+          ptr[1] = (digest[0] >> 8) | (digest[1] << 24);
+          ptr[2] = (digest[1] >> 8) | (digest[2] << 24);
+          ptr[3] = (digest[2] >> 8) | (digest[3] << 24);
+          ptr[4] = (ptr[4] & 0xff000000) | (digest[3] >> 8);
+          break;
       }
     }
     else
diff --git a/OpenCL/m01600.cl b/OpenCL/m01600.cl
index a635eda9b..7dcfe1f26 100644
--- a/OpenCL/m01600.cl
+++ b/OpenCL/m01600.cl
@@ -15,36 +15,12 @@
 #define COMPARE_S "inc_comp_single.cl"
 #define COMPARE_M "inc_comp_multi.cl"
 
+#define PUTCHAR_LE(a,p,c) ((u8 *)(a))[(p)] = (u8) (c)
+#define GETCHAR_LE(a,p) ((u8 *)(a))[(p)]
+
 #define md5apr1_magic0 0x72706124u
 #define md5apr1_magic1 0x00002431u
 
-DECLSPEC void md5_transform_transport (const u32 *w, u32 *digest)
-{
-  u32 t0[4];
-  u32 t1[4];
-  u32 t2[4];
-  u32 t3[4];
-
-  t0[0] = w[ 0];
-  t0[1] = w[ 1];
-  t0[2] = w[ 2];
-  t0[3] = w[ 3];
-  t1[0] = w[ 4];
-  t1[1] = w[ 5];
-  t1[2] = w[ 6];
-  t1[3] = w[ 7];
-  t2[0] = w[ 8];
-  t2[1] = w[ 9];
-  t2[2] = w[10];
-  t2[3] = w[11];
-  t3[0] = w[12];
-  t3[1] = w[13];
-  t3[2] = w[14];
-  t3[3] = w[15];
-
-  md5_transform (t0, t1, t2, t3, digest);
-}
-
 __kernel void m01600_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global md5crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
 {
   /**
@@ -210,9 +186,6 @@ __kernel void m01600_loop (__global pw_t *pws, __global const kernel_rule_t *rul
 
   u32 wpc[8][64 + 64 + 64 + 64];
 
-  #define PUTCHAR_LE(a,p,c) ((u8 *)(a))[(p)] = (u8) (c)
-  #define GETCHAR_LE(a,p) ((u8 *)(a))[(p)]
-
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -285,12 +258,43 @@ __kernel void m01600_loop (__global pw_t *pws, __global const kernel_rule_t *rul
 
     if (j1)
     {
-      #ifdef _unroll
-      #pragma unroll
-      #endif
-      for (u32 k = 0, p = wpc_len[pc] - 16; k < 16; k++, p++)
+      const u32 off = wpc_len[pc] / 4;
+      const u32 mod = wpc_len[pc] % 4;
+
+      u32 *ptr = wpc[pc] + off - 4;
+
+      switch (mod)
       {
-        PUTCHAR_LE (wpc[pc], p, GETCHAR_LE (digest, k));
+        case 0:
+          ptr[0] = digest[0];
+          ptr[1] = digest[1];
+          ptr[2] = digest[2];
+          ptr[3] = digest[3];
+          break;
+
+        case 1:
+          ptr[0] = (ptr[0] & 0xff) | (digest[0] << 8);
+          ptr[1] = (digest[0] >> 24) | (digest[1] << 8);
+          ptr[2] = (digest[1] >> 24) | (digest[2] << 8);
+          ptr[3] = (digest[2] >> 24) | (digest[3] << 8);
+          ptr[4] = (ptr[4] & 0xffffff00) | (digest[3] >> 24);
+          break;
+
+        case 2:
+          ptr[0] = (ptr[0] & 0xffff) | (digest[0] << 16);
+          ptr[1] = (digest[0] >> 16) | (digest[1] << 16);
+          ptr[2] = (digest[1] >> 16) | (digest[2] << 16);
+          ptr[3] = (digest[2] >> 16) | (digest[3] << 16);
+          ptr[4] = (ptr[4] & 0xffff0000) | (digest[3] >> 16);
+          break;
+
+        case 3:
+          ptr[0] = (ptr[0] & 0xffffff) | (digest[0] << 24);
+          ptr[1] = (digest[0] >> 8) | (digest[1] << 24);
+          ptr[2] = (digest[1] >> 8) | (digest[2] << 24);
+          ptr[3] = (digest[2] >> 8) | (digest[3] << 24);
+          ptr[4] = (ptr[4] & 0xff000000) | (digest[3] >> 8);
+          break;
       }
     }
     else
diff --git a/OpenCL/m06300.cl b/OpenCL/m06300.cl
index 0ed96ee2e..7f9f6a56b 100644
--- a/OpenCL/m06300.cl
+++ b/OpenCL/m06300.cl
@@ -12,36 +12,12 @@
 #include "inc_common.cl"
 #include "inc_hash_md5.cl"
 
+#define PUTCHAR_LE(a,p,c) ((u8 *)(a))[(p)] = (u8) (c)
+#define GETCHAR_LE(a,p) ((u8 *)(a))[(p)]
+
 #define COMPARE_S "inc_comp_single.cl"
 #define COMPARE_M "inc_comp_multi.cl"
 
-DECLSPEC void md5_transform_transport (const u32 *w, u32 *digest)
-{
-  u32 t0[4];
-  u32 t1[4];
-  u32 t2[4];
-  u32 t3[4];
-
-  t0[0] = w[ 0];
-  t0[1] = w[ 1];
-  t0[2] = w[ 2];
-  t0[3] = w[ 3];
-  t1[0] = w[ 4];
-  t1[1] = w[ 5];
-  t1[2] = w[ 6];
-  t1[3] = w[ 7];
-  t2[0] = w[ 8];
-  t2[1] = w[ 9];
-  t2[2] = w[10];
-  t2[3] = w[11];
-  t3[0] = w[12];
-  t3[1] = w[13];
-  t3[2] = w[14];
-  t3[3] = w[15];
-
-  md5_transform (t0, t1, t2, t3, digest);
-}
-
 __kernel void m06300_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global md5crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
 {
   /**
@@ -200,9 +176,6 @@ __kernel void m06300_loop (__global pw_t *pws, __global const kernel_rule_t *rul
 
   u32 wpc[8][64 + 64 + 64 + 64];
 
-  #define PUTCHAR_LE(a,p,c) ((u8 *)(a))[(p)] = (u8) (c)
-  #define GETCHAR_LE(a,p) ((u8 *)(a))[(p)]
-
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -275,12 +248,43 @@ __kernel void m06300_loop (__global pw_t *pws, __global const kernel_rule_t *rul
 
     if (j1)
     {
-      #ifdef _unroll
-      #pragma unroll
-      #endif
-      for (u32 k = 0, p = wpc_len[pc] - 16; k < 16; k++, p++)
+      const u32 off = wpc_len[pc] / 4;
+      const u32 mod = wpc_len[pc] % 4;
+
+      u32 *ptr = wpc[pc] + off - 4;
+
+      switch (mod)
       {
-        PUTCHAR_LE (wpc[pc], p, GETCHAR_LE (digest, k));
+        case 0:
+          ptr[0] = digest[0];
+          ptr[1] = digest[1];
+          ptr[2] = digest[2];
+          ptr[3] = digest[3];
+          break;
+
+        case 1:
+          ptr[0] = (ptr[0] & 0xff) | (digest[0] << 8);
+          ptr[1] = (digest[0] >> 24) | (digest[1] << 8);
+          ptr[2] = (digest[1] >> 24) | (digest[2] << 8);
+          ptr[3] = (digest[2] >> 24) | (digest[3] << 8);
+          ptr[4] = (ptr[4] & 0xffffff00) | (digest[3] >> 24);
+          break;
+
+        case 2:
+          ptr[0] = (ptr[0] & 0xffff) | (digest[0] << 16);
+          ptr[1] = (digest[0] >> 16) | (digest[1] << 16);
+          ptr[2] = (digest[1] >> 16) | (digest[2] << 16);
+          ptr[3] = (digest[2] >> 16) | (digest[3] << 16);
+          ptr[4] = (ptr[4] & 0xffff0000) | (digest[3] >> 16);
+          break;
+
+        case 3:
+          ptr[0] = (ptr[0] & 0xffffff) | (digest[0] << 24);
+          ptr[1] = (digest[0] >> 8) | (digest[1] << 24);
+          ptr[2] = (digest[1] >> 8) | (digest[2] << 24);
+          ptr[3] = (digest[2] >> 8) | (digest[3] << 24);
+          ptr[4] = (ptr[4] & 0xff000000) | (digest[3] >> 8);
+          break;
       }
     }
     else