diff --git a/OpenCL/m07100.cl b/OpenCL/m07100.cl index 1464b0ec1..963dd1651 100644 --- a/OpenCL/m07100.cl +++ b/OpenCL/m07100.cl @@ -5,6 +5,8 @@ #define _PBKDF2_SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,6 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" #define COMPARE_S "OpenCL/check_single_comp4.c" #define COMPARE_M "OpenCL/check_multi_comp4.c" @@ -44,7 +47,7 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -void sha512_transform (const u64 w[16], u64 dgst[8]) +void sha512_transform_S (const u64 w[16], u64 dgst[8]) { u64 a = dgst[0]; u64 b = dgst[1]; @@ -72,6 +75,197 @@ void sha512_transform (const u64 w[16], u64 dgst[8]) u64 we_t = w[14]; u64 wf_t = w[15]; + #define ROUND_EXPAND() \ + { \ + w0_t = SHA512_EXPAND_S (we_t, w9_t, w1_t, w0_t); \ + w1_t = SHA512_EXPAND_S (wf_t, wa_t, w2_t, w1_t); \ + w2_t = SHA512_EXPAND_S (w0_t, wb_t, w3_t, w2_t); \ + w3_t = SHA512_EXPAND_S (w1_t, wc_t, w4_t, w3_t); \ + w4_t = SHA512_EXPAND_S (w2_t, wd_t, w5_t, w4_t); \ + w5_t = SHA512_EXPAND_S (w3_t, we_t, w6_t, w5_t); \ + w6_t = SHA512_EXPAND_S (w4_t, wf_t, w7_t, w6_t); \ + w7_t = SHA512_EXPAND_S (w5_t, w0_t, w8_t, w7_t); \ + w8_t = SHA512_EXPAND_S (w6_t, w1_t, w9_t, w8_t); \ + w9_t = SHA512_EXPAND_S (w7_t, w2_t, wa_t, w9_t); \ + wa_t = SHA512_EXPAND_S (w8_t, w3_t, wb_t, wa_t); \ + wb_t = SHA512_EXPAND_S (w9_t, w4_t, wc_t, wb_t); \ + wc_t = SHA512_EXPAND_S (wa_t, w5_t, wd_t, wc_t); \ + wd_t = SHA512_EXPAND_S (wb_t, w6_t, we_t, wd_t); \ + we_t = SHA512_EXPAND_S (wc_t, w7_t, wf_t, we_t); \ + wf_t = SHA512_EXPAND_S (wd_t, w8_t, w0_t, wf_t); \ + } + + #define ROUND_STEP(i) \ + { \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha512[i + 0]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha512[i + 1]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha512[i + 2]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha512[i + 3]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha512[i + 4]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha512[i + 5]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha512[i + 6]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha512[i + 7]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha512[i + 8]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha512[i + 9]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha512[i + 10]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha512[i + 11]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha512[i + 12]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha512[i + 13]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, c, d, e, f, g, h, a, b, we_t, k_sha512[i + 14]); \ + SHA512_STEP_S (SHA512_F0o, SHA512_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha512[i + 15]); \ + } + + ROUND_STEP (0); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 16; i < 80; i += 16) + { + ROUND_EXPAND (); ROUND_STEP (i); + } + + dgst[0] += a; + dgst[1] += b; + dgst[2] += c; + dgst[3] += d; + dgst[4] += e; + dgst[5] += f; + dgst[6] += g; + dgst[7] += h; +} + +void hmac_sha512_run_S (const u64 w1[16], const u64 ipad[8], const u64 opad[8], u64 dgst[8]) +{ + dgst[0] = ipad[0]; + dgst[1] = ipad[1]; + dgst[2] = ipad[2]; + dgst[3] = ipad[3]; + dgst[4] = ipad[4]; + dgst[5] = ipad[5]; + dgst[6] = ipad[6]; + dgst[7] = ipad[7]; + + sha512_transform_S (w1, dgst); + + u64 w[16]; + + w[ 0] = dgst[0]; + w[ 1] = dgst[1]; + w[ 2] = dgst[2]; + w[ 3] = dgst[3]; + w[ 4] = dgst[4]; + w[ 5] = dgst[5]; + w[ 6] = dgst[6]; + w[ 7] = dgst[7]; + w[ 8] = 0x8000000000000000; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = (128 + 64) * 8; + + dgst[0] = opad[0]; + dgst[1] = opad[1]; + dgst[2] = opad[2]; + dgst[3] = opad[3]; + dgst[4] = opad[4]; + dgst[5] = opad[5]; + dgst[6] = opad[6]; + dgst[7] = opad[7]; + + sha512_transform_S (w, dgst); +} + +void hmac_sha512_init_S (u64 w[16], u64 ipad[8], u64 opad[8]) +{ + w[ 0] ^= 0x3636363636363636; + w[ 1] ^= 0x3636363636363636; + w[ 2] ^= 0x3636363636363636; + w[ 3] ^= 0x3636363636363636; + w[ 4] ^= 0x3636363636363636; + w[ 5] ^= 0x3636363636363636; + w[ 6] ^= 0x3636363636363636; + w[ 7] ^= 0x3636363636363636; + w[ 8] ^= 0x3636363636363636; + w[ 9] ^= 0x3636363636363636; + w[10] ^= 0x3636363636363636; + w[11] ^= 0x3636363636363636; + w[12] ^= 0x3636363636363636; + w[13] ^= 0x3636363636363636; + w[14] ^= 0x3636363636363636; + w[15] ^= 0x3636363636363636; + + ipad[0] = SHA512M_A; + ipad[1] = SHA512M_B; + ipad[2] = SHA512M_C; + ipad[3] = SHA512M_D; + ipad[4] = SHA512M_E; + ipad[5] = SHA512M_F; + ipad[6] = SHA512M_G; + ipad[7] = SHA512M_H; + + sha512_transform_S (w, ipad); + + w[ 0] ^= 0x6a6a6a6a6a6a6a6a; + w[ 1] ^= 0x6a6a6a6a6a6a6a6a; + w[ 2] ^= 0x6a6a6a6a6a6a6a6a; + w[ 3] ^= 0x6a6a6a6a6a6a6a6a; + w[ 4] ^= 0x6a6a6a6a6a6a6a6a; + w[ 5] ^= 0x6a6a6a6a6a6a6a6a; + w[ 6] ^= 0x6a6a6a6a6a6a6a6a; + w[ 7] ^= 0x6a6a6a6a6a6a6a6a; + w[ 8] ^= 0x6a6a6a6a6a6a6a6a; + w[ 9] ^= 0x6a6a6a6a6a6a6a6a; + w[10] ^= 0x6a6a6a6a6a6a6a6a; + w[11] ^= 0x6a6a6a6a6a6a6a6a; + w[12] ^= 0x6a6a6a6a6a6a6a6a; + w[13] ^= 0x6a6a6a6a6a6a6a6a; + w[14] ^= 0x6a6a6a6a6a6a6a6a; + w[15] ^= 0x6a6a6a6a6a6a6a6a; + + opad[0] = SHA512M_A; + opad[1] = SHA512M_B; + opad[2] = SHA512M_C; + opad[3] = SHA512M_D; + opad[4] = SHA512M_E; + opad[5] = SHA512M_F; + opad[6] = SHA512M_G; + opad[7] = SHA512M_H; + + sha512_transform_S (w, opad); +} + +void sha512_transform_V (const u64x w[16], u64x dgst[8]) +{ + u64x a = dgst[0]; + u64x b = dgst[1]; + u64x c = dgst[2]; + u64x d = dgst[3]; + u64x e = dgst[4]; + u64x f = dgst[5]; + u64x g = dgst[6]; + u64x h = dgst[7]; + + u64x w0_t = w[ 0]; + u64x w1_t = w[ 1]; + u64x w2_t = w[ 2]; + u64x w3_t = w[ 3]; + u64x w4_t = w[ 4]; + u64x w5_t = w[ 5]; + u64x w6_t = w[ 6]; + u64x w7_t = w[ 7]; + u64x w8_t = w[ 8]; + u64x w9_t = w[ 9]; + u64x wa_t = w[10]; + u64x wb_t = w[11]; + u64x wc_t = w[12]; + u64x wd_t = w[13]; + u64x we_t = w[14]; + u64x wf_t = w[15]; + #define ROUND_EXPAND() \ { \ w0_t = SHA512_EXPAND (we_t, w9_t, w1_t, w0_t); \ @@ -132,7 +326,7 @@ void sha512_transform (const u64 w[16], u64 dgst[8]) dgst[7] += h; } -void hmac_run (const u64 w1[16], const u64 ipad[8], const u64 opad[8], u64 dgst[8]) +void hmac_sha512_run_V (const u64x w1[16], const u64x ipad[8], const u64x opad[8], u64x dgst[8]) { dgst[0] = ipad[0]; dgst[1] = ipad[1]; @@ -143,9 +337,9 @@ void hmac_run (const u64 w1[16], const u64 ipad[8], const u64 opad[8], u64 dgst[ dgst[6] = ipad[6]; dgst[7] = ipad[7]; - sha512_transform (w1, dgst); + sha512_transform_V (w1, dgst); - u64 w[16]; + u64x w[16]; w[ 0] = dgst[0]; w[ 1] = dgst[1]; @@ -173,10 +367,10 @@ void hmac_run (const u64 w1[16], const u64 ipad[8], const u64 opad[8], u64 dgst[ dgst[6] = opad[6]; dgst[7] = opad[7]; - sha512_transform (w, dgst); + sha512_transform_V (w, dgst); } -void hmac_init (u64 w[16], u64 ipad[8], u64 opad[8]) +void hmac_sha512_init_V (u64x w[16], u64x ipad[8], u64x opad[8]) { w[ 0] ^= 0x3636363636363636; w[ 1] ^= 0x3636363636363636; @@ -204,7 +398,7 @@ void hmac_init (u64 w[16], u64 ipad[8], u64 opad[8]) ipad[6] = SHA512M_G; ipad[7] = SHA512M_H; - sha512_transform (w, ipad); + sha512_transform_V (w, ipad); w[ 0] ^= 0x6a6a6a6a6a6a6a6a; w[ 1] ^= 0x6a6a6a6a6a6a6a6a; @@ -232,7 +426,7 @@ void hmac_init (u64 w[16], u64 ipad[8], u64 opad[8]) opad[6] = SHA512M_G; opad[7] = SHA512M_H; - sha512_transform (w, opad); + sha512_transform_V (w, opad); } __kernel void m07100_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global pbkdf2_sha512_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global pbkdf2_sha512_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -247,31 +441,31 @@ __kernel void m07100_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 w0[4]; - w0[0] = swap32 (pws[gid].i[ 0]); - w0[1] = swap32 (pws[gid].i[ 1]); - w0[2] = swap32 (pws[gid].i[ 2]); - w0[3] = swap32 (pws[gid].i[ 3]); + w0[0] = swap32_S (pws[gid].i[ 0]); + w0[1] = swap32_S (pws[gid].i[ 1]); + w0[2] = swap32_S (pws[gid].i[ 2]); + w0[3] = swap32_S (pws[gid].i[ 3]); u32 w1[4]; - w1[0] = swap32 (pws[gid].i[ 4]); - w1[1] = swap32 (pws[gid].i[ 5]); - w1[2] = swap32 (pws[gid].i[ 6]); - w1[3] = swap32 (pws[gid].i[ 7]); + w1[0] = swap32_S (pws[gid].i[ 4]); + w1[1] = swap32_S (pws[gid].i[ 5]); + w1[2] = swap32_S (pws[gid].i[ 6]); + w1[3] = swap32_S (pws[gid].i[ 7]); u32 w2[4]; - w2[0] = swap32 (pws[gid].i[ 8]); - w2[1] = swap32 (pws[gid].i[ 9]); - w2[2] = swap32 (pws[gid].i[10]); - w2[3] = swap32 (pws[gid].i[11]); + w2[0] = swap32_S (pws[gid].i[ 8]); + w2[1] = swap32_S (pws[gid].i[ 9]); + w2[2] = swap32_S (pws[gid].i[10]); + w2[3] = swap32_S (pws[gid].i[11]); u32 w3[4]; - w3[0] = swap32 (pws[gid].i[12]); - w3[1] = swap32 (pws[gid].i[13]); - w3[2] = swap32 (pws[gid].i[14]); - w3[3] = swap32 (pws[gid].i[15]); + w3[0] = swap32_S (pws[gid].i[12]); + w3[1] = swap32_S (pws[gid].i[13]); + w3[2] = swap32_S (pws[gid].i[14]); + w3[3] = swap32_S (pws[gid].i[15]); /** * salt @@ -281,33 +475,33 @@ __kernel void m07100_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 salt_len = salt_bufs[salt_pos].salt_len; - esalt_buf[ 0] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 0]), swap32 (esalt_bufs[salt_pos].salt_buf[ 1])); - esalt_buf[ 1] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 2]), swap32 (esalt_bufs[salt_pos].salt_buf[ 3])); - esalt_buf[ 2] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 4]), swap32 (esalt_bufs[salt_pos].salt_buf[ 5])); - esalt_buf[ 3] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 6]), swap32 (esalt_bufs[salt_pos].salt_buf[ 7])); - esalt_buf[ 4] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[ 8]), swap32 (esalt_bufs[salt_pos].salt_buf[ 9])); - esalt_buf[ 5] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[10]), swap32 (esalt_bufs[salt_pos].salt_buf[11])); - esalt_buf[ 6] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[12]), swap32 (esalt_bufs[salt_pos].salt_buf[13])); - esalt_buf[ 7] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[14]), swap32 (esalt_bufs[salt_pos].salt_buf[15])); - esalt_buf[ 8] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[16]), swap32 (esalt_bufs[salt_pos].salt_buf[17])); - esalt_buf[ 9] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[18]), swap32 (esalt_bufs[salt_pos].salt_buf[19])); - esalt_buf[10] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[20]), swap32 (esalt_bufs[salt_pos].salt_buf[21])); - esalt_buf[11] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[22]), swap32 (esalt_bufs[salt_pos].salt_buf[23])); - esalt_buf[12] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[24]), swap32 (esalt_bufs[salt_pos].salt_buf[25])); - esalt_buf[13] = hl32_to_64 (swap32 (esalt_bufs[salt_pos].salt_buf[26]), swap32 (esalt_bufs[salt_pos].salt_buf[27])); + esalt_buf[ 0] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[ 0]), swap32_S (esalt_bufs[salt_pos].salt_buf[ 1])); + esalt_buf[ 1] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[ 2]), swap32_S (esalt_bufs[salt_pos].salt_buf[ 3])); + esalt_buf[ 2] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[ 4]), swap32_S (esalt_bufs[salt_pos].salt_buf[ 5])); + esalt_buf[ 3] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[ 6]), swap32_S (esalt_bufs[salt_pos].salt_buf[ 7])); + esalt_buf[ 4] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[ 8]), swap32_S (esalt_bufs[salt_pos].salt_buf[ 9])); + esalt_buf[ 5] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[10]), swap32_S (esalt_bufs[salt_pos].salt_buf[11])); + esalt_buf[ 6] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[12]), swap32_S (esalt_bufs[salt_pos].salt_buf[13])); + esalt_buf[ 7] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[14]), swap32_S (esalt_bufs[salt_pos].salt_buf[15])); + esalt_buf[ 8] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[16]), swap32_S (esalt_bufs[salt_pos].salt_buf[17])); + esalt_buf[ 9] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[18]), swap32_S (esalt_bufs[salt_pos].salt_buf[19])); + esalt_buf[10] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[20]), swap32_S (esalt_bufs[salt_pos].salt_buf[21])); + esalt_buf[11] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[22]), swap32_S (esalt_bufs[salt_pos].salt_buf[23])); + esalt_buf[12] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[24]), swap32_S (esalt_bufs[salt_pos].salt_buf[25])); + esalt_buf[13] = hl32_to_64_S (swap32_S (esalt_bufs[salt_pos].salt_buf[26]), swap32_S (esalt_bufs[salt_pos].salt_buf[27])); esalt_buf[14] = 0; esalt_buf[15] = (128 + salt_len + 4) * 8; u64 w[16]; - w[ 0] = hl32_to_64 (w0[0], w0[1]); - w[ 1] = hl32_to_64 (w0[2], w0[3]); - w[ 2] = hl32_to_64 (w1[0], w1[1]); - w[ 3] = hl32_to_64 (w1[2], w1[3]); - w[ 4] = hl32_to_64 (w2[0], w2[1]); - w[ 5] = hl32_to_64 (w2[2], w2[3]); - w[ 6] = hl32_to_64 (w3[0], w3[1]); - w[ 7] = hl32_to_64 (w3[2], w3[3]); + w[ 0] = hl32_to_64_S (w0[0], w0[1]); + w[ 1] = hl32_to_64_S (w0[2], w0[3]); + w[ 2] = hl32_to_64_S (w1[0], w1[1]); + w[ 3] = hl32_to_64_S (w1[2], w1[3]); + w[ 4] = hl32_to_64_S (w2[0], w2[1]); + w[ 5] = hl32_to_64_S (w2[2], w2[3]); + w[ 6] = hl32_to_64_S (w3[0], w3[1]); + w[ 7] = hl32_to_64_S (w3[2], w3[3]); w[ 8] = 0; w[ 9] = 0; w[10] = 0; @@ -320,7 +514,7 @@ __kernel void m07100_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u64 ipad[8]; u64 opad[8]; - hmac_init (w, ipad, opad); + hmac_sha512_init_S (w, ipad, opad); tmps[gid].ipad[0] = ipad[0]; tmps[gid].ipad[1] = ipad[1]; @@ -344,7 +538,7 @@ __kernel void m07100_init (__global pw_t *pws, __global kernel_rule_t *rules_buf { u64 dgst[8]; - hmac_run (esalt_buf, ipad, opad, dgst); + hmac_sha512_run_S (esalt_buf, ipad, opad, dgst); tmps[gid].dgst[i + 0] = dgst[0]; tmps[gid].dgst[i + 1] = dgst[1]; @@ -372,55 +566,53 @@ __kernel void m07100_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf if (gid >= gid_max) return; - u64 ipad[8]; - - ipad[0] = tmps[gid].ipad[0]; - ipad[1] = tmps[gid].ipad[1]; - ipad[2] = tmps[gid].ipad[2]; - ipad[3] = tmps[gid].ipad[3]; - ipad[4] = tmps[gid].ipad[4]; - ipad[5] = tmps[gid].ipad[5]; - ipad[6] = tmps[gid].ipad[6]; - ipad[7] = tmps[gid].ipad[7]; - - u64 opad[8]; - - opad[0] = tmps[gid].opad[0]; - opad[1] = tmps[gid].opad[1]; - opad[2] = tmps[gid].opad[2]; - opad[3] = tmps[gid].opad[3]; - opad[4] = tmps[gid].opad[4]; - opad[5] = tmps[gid].opad[5]; - opad[6] = tmps[gid].opad[6]; - opad[7] = tmps[gid].opad[7]; + u64x ipad[8]; + u64x opad[8]; + + ipad[0] = pack64v (tmps, ipad, gid, 0); + ipad[1] = pack64v (tmps, ipad, gid, 1); + ipad[2] = pack64v (tmps, ipad, gid, 2); + ipad[3] = pack64v (tmps, ipad, gid, 3); + ipad[4] = pack64v (tmps, ipad, gid, 4); + ipad[5] = pack64v (tmps, ipad, gid, 5); + ipad[6] = pack64v (tmps, ipad, gid, 6); + ipad[7] = pack64v (tmps, ipad, gid, 7); + + opad[0] = pack64v (tmps, opad, gid, 0); + opad[1] = pack64v (tmps, opad, gid, 1); + opad[2] = pack64v (tmps, opad, gid, 2); + opad[3] = pack64v (tmps, opad, gid, 3); + opad[4] = pack64v (tmps, opad, gid, 4); + opad[5] = pack64v (tmps, opad, gid, 5); + opad[6] = pack64v (tmps, opad, gid, 6); + opad[7] = pack64v (tmps, opad, gid, 7); for (u32 i = 0; i < 8; i += 8) { - u64 dgst[8]; - - dgst[0] = tmps[gid].dgst[i + 0]; - dgst[1] = tmps[gid].dgst[i + 1]; - dgst[2] = tmps[gid].dgst[i + 2]; - dgst[3] = tmps[gid].dgst[i + 3]; - dgst[4] = tmps[gid].dgst[i + 4]; - dgst[5] = tmps[gid].dgst[i + 5]; - dgst[6] = tmps[gid].dgst[i + 6]; - dgst[7] = tmps[gid].dgst[i + 7]; - - u64 out[8]; - - out[0] = tmps[gid].out[i + 0]; - out[1] = tmps[gid].out[i + 1]; - out[2] = tmps[gid].out[i + 2]; - out[3] = tmps[gid].out[i + 3]; - out[4] = tmps[gid].out[i + 4]; - out[5] = tmps[gid].out[i + 5]; - out[6] = tmps[gid].out[i + 6]; - out[7] = tmps[gid].out[i + 7]; + u64x dgst[8]; + u64x out[8]; + + dgst[0] = pack64v (tmps, dgst, gid, 0); + dgst[1] = pack64v (tmps, dgst, gid, 1); + dgst[2] = pack64v (tmps, dgst, gid, 2); + dgst[3] = pack64v (tmps, dgst, gid, 3); + dgst[4] = pack64v (tmps, dgst, gid, 4); + dgst[5] = pack64v (tmps, dgst, gid, 5); + dgst[6] = pack64v (tmps, dgst, gid, 6); + dgst[7] = pack64v (tmps, dgst, gid, 7); + + out[0] = pack64v (tmps, out, gid, 0); + out[1] = pack64v (tmps, out, gid, 1); + out[2] = pack64v (tmps, out, gid, 2); + out[3] = pack64v (tmps, out, gid, 3); + out[4] = pack64v (tmps, out, gid, 4); + out[5] = pack64v (tmps, out, gid, 5); + out[6] = pack64v (tmps, out, gid, 6); + out[7] = pack64v (tmps, out, gid, 7); for (u32 j = 0; j < loop_cnt; j++) { - u64 w[16]; + u64x w[16]; w[ 0] = dgst[0]; w[ 1] = dgst[1]; @@ -439,7 +631,7 @@ __kernel void m07100_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf w[14] = 0; w[15] = (128 + 64) * 8; - hmac_run (w, ipad, opad, dgst); + hmac_sha512_run_V (w, ipad, opad, dgst); out[0] ^= dgst[0]; out[1] ^= dgst[1]; @@ -451,23 +643,23 @@ __kernel void m07100_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf out[7] ^= dgst[7]; } - tmps[gid].dgst[i + 0] = dgst[0]; - tmps[gid].dgst[i + 1] = dgst[1]; - tmps[gid].dgst[i + 2] = dgst[2]; - tmps[gid].dgst[i + 3] = dgst[3]; - tmps[gid].dgst[i + 4] = dgst[4]; - tmps[gid].dgst[i + 5] = dgst[5]; - tmps[gid].dgst[i + 6] = dgst[6]; - tmps[gid].dgst[i + 7] = dgst[7]; - - tmps[gid].out[i + 0] = out[0]; - tmps[gid].out[i + 1] = out[1]; - tmps[gid].out[i + 2] = out[2]; - tmps[gid].out[i + 3] = out[3]; - tmps[gid].out[i + 4] = out[4]; - tmps[gid].out[i + 5] = out[5]; - tmps[gid].out[i + 6] = out[6]; - tmps[gid].out[i + 7] = out[7]; + unpackv (tmps, dgst, gid, 0, dgst[0]); + unpackv (tmps, dgst, gid, 1, dgst[1]); + unpackv (tmps, dgst, gid, 2, dgst[2]); + unpackv (tmps, dgst, gid, 3, dgst[3]); + unpackv (tmps, dgst, gid, 4, dgst[4]); + unpackv (tmps, dgst, gid, 5, dgst[5]); + unpackv (tmps, dgst, gid, 6, dgst[6]); + unpackv (tmps, dgst, gid, 7, dgst[7]); + + unpackv (tmps, out, gid, 0, out[0]); + unpackv (tmps, out, gid, 1, out[1]); + unpackv (tmps, out, gid, 2, out[2]); + unpackv (tmps, out, gid, 3, out[3]); + unpackv (tmps, out, gid, 4, out[4]); + unpackv (tmps, out, gid, 5, out[5]); + unpackv (tmps, out, gid, 6, out[6]); + unpackv (tmps, out, gid, 7, out[7]); } } @@ -486,10 +678,10 @@ __kernel void m07100_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf const u64 a = tmps[gid].out[0]; const u64 b = tmps[gid].out[1]; - const u32 r0 = l32_from_64 (a); - const u32 r1 = h32_from_64 (a); - const u32 r2 = l32_from_64 (b); - const u32 r3 = h32_from_64 (b); + const u32 r0 = l32_from_64_S (a); + const u32 r1 = h32_from_64_S (a); + const u32 r2 = l32_from_64_S (b); + const u32 r3 = h32_from_64_S (b); #define il_pos 0 diff --git a/OpenCL/m10900.cl b/OpenCL/m10900.cl index 0107d84f0..c2f24576d 100644 --- a/OpenCL/m10900.cl +++ b/OpenCL/m10900.cl @@ -5,6 +5,8 @@ #define _PBKDF2_SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -14,9 +16,9 @@ #define DGST_R3 3 #include "include/kernel_functions.c" - #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" #define COMPARE_S "OpenCL/check_single_comp4.c" #define COMPARE_M "OpenCL/check_multi_comp4.c" @@ -41,7 +43,7 @@ __constant u32 k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8]) +void sha256_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8]) { u32 a = digest[0]; u32 b = digest[1]; @@ -69,6 +71,195 @@ void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 we_t = w3[2]; u32 wf_t = w3[3]; + #define ROUND_EXPAND() \ + { \ + w0_t = SHA256_EXPAND_S (we_t, w9_t, w1_t, w0_t); \ + w1_t = SHA256_EXPAND_S (wf_t, wa_t, w2_t, w1_t); \ + w2_t = SHA256_EXPAND_S (w0_t, wb_t, w3_t, w2_t); \ + w3_t = SHA256_EXPAND_S (w1_t, wc_t, w4_t, w3_t); \ + w4_t = SHA256_EXPAND_S (w2_t, wd_t, w5_t, w4_t); \ + w5_t = SHA256_EXPAND_S (w3_t, we_t, w6_t, w5_t); \ + w6_t = SHA256_EXPAND_S (w4_t, wf_t, w7_t, w6_t); \ + w7_t = SHA256_EXPAND_S (w5_t, w0_t, w8_t, w7_t); \ + w8_t = SHA256_EXPAND_S (w6_t, w1_t, w9_t, w8_t); \ + w9_t = SHA256_EXPAND_S (w7_t, w2_t, wa_t, w9_t); \ + wa_t = SHA256_EXPAND_S (w8_t, w3_t, wb_t, wa_t); \ + wb_t = SHA256_EXPAND_S (w9_t, w4_t, wc_t, wb_t); \ + wc_t = SHA256_EXPAND_S (wa_t, w5_t, wd_t, wc_t); \ + wd_t = SHA256_EXPAND_S (wb_t, w6_t, we_t, wd_t); \ + we_t = SHA256_EXPAND_S (wc_t, w7_t, wf_t, we_t); \ + wf_t = SHA256_EXPAND_S (wd_t, w8_t, w0_t, wf_t); \ + } + + #define ROUND_STEP(i) \ + { \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ + SHA256_STEP_S (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ + } + + ROUND_STEP (0); + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 16; i < 64; i += 16) + { + ROUND_EXPAND (); ROUND_STEP (i); + } + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; + digest[4] += e; + digest[5] += f; + digest[6] += g; + digest[7] += h; +} + +void hmac_sha256_pad_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8]) +{ + w0[0] = w0[0] ^ 0x36363636; + w0[1] = w0[1] ^ 0x36363636; + w0[2] = w0[2] ^ 0x36363636; + w0[3] = w0[3] ^ 0x36363636; + w1[0] = w1[0] ^ 0x36363636; + w1[1] = w1[1] ^ 0x36363636; + w1[2] = w1[2] ^ 0x36363636; + w1[3] = w1[3] ^ 0x36363636; + w2[0] = w2[0] ^ 0x36363636; + w2[1] = w2[1] ^ 0x36363636; + w2[2] = w2[2] ^ 0x36363636; + w2[3] = w2[3] ^ 0x36363636; + w3[0] = w3[0] ^ 0x36363636; + w3[1] = w3[1] ^ 0x36363636; + w3[2] = w3[2] ^ 0x36363636; + w3[3] = w3[3] ^ 0x36363636; + + ipad[0] = SHA256M_A; + ipad[1] = SHA256M_B; + ipad[2] = SHA256M_C; + ipad[3] = SHA256M_D; + ipad[4] = SHA256M_E; + ipad[5] = SHA256M_F; + ipad[6] = SHA256M_G; + ipad[7] = SHA256M_H; + + sha256_transform_S (w0, w1, w2, w3, ipad); + + w0[0] = w0[0] ^ 0x6a6a6a6a; + w0[1] = w0[1] ^ 0x6a6a6a6a; + w0[2] = w0[2] ^ 0x6a6a6a6a; + w0[3] = w0[3] ^ 0x6a6a6a6a; + w1[0] = w1[0] ^ 0x6a6a6a6a; + w1[1] = w1[1] ^ 0x6a6a6a6a; + w1[2] = w1[2] ^ 0x6a6a6a6a; + w1[3] = w1[3] ^ 0x6a6a6a6a; + w2[0] = w2[0] ^ 0x6a6a6a6a; + w2[1] = w2[1] ^ 0x6a6a6a6a; + w2[2] = w2[2] ^ 0x6a6a6a6a; + w2[3] = w2[3] ^ 0x6a6a6a6a; + w3[0] = w3[0] ^ 0x6a6a6a6a; + w3[1] = w3[1] ^ 0x6a6a6a6a; + w3[2] = w3[2] ^ 0x6a6a6a6a; + w3[3] = w3[3] ^ 0x6a6a6a6a; + + opad[0] = SHA256M_A; + opad[1] = SHA256M_B; + opad[2] = SHA256M_C; + opad[3] = SHA256M_D; + opad[4] = SHA256M_E; + opad[5] = SHA256M_F; + opad[6] = SHA256M_G; + opad[7] = SHA256M_H; + + sha256_transform_S (w0, w1, w2, w3, opad); +} + +void hmac_sha256_run_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8]) +{ + digest[0] = ipad[0]; + digest[1] = ipad[1]; + digest[2] = ipad[2]; + digest[3] = ipad[3]; + digest[4] = ipad[4]; + digest[5] = ipad[5]; + digest[6] = ipad[6]; + digest[7] = ipad[7]; + + sha256_transform_S (w0, w1, w2, w3, digest); + + w0[0] = digest[0]; + w0[1] = digest[1]; + w0[2] = digest[2]; + w0[3] = digest[3]; + w1[0] = digest[4]; + w1[1] = digest[5]; + w1[2] = digest[6]; + w1[3] = digest[7]; + w2[0] = 0x80000000; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = (64 + 32) * 8; + + digest[0] = opad[0]; + digest[1] = opad[1]; + digest[2] = opad[2]; + digest[3] = opad[3]; + digest[4] = opad[4]; + digest[5] = opad[5]; + digest[6] = opad[6]; + digest[7] = opad[7]; + + sha256_transform_S (w0, w1, w2, w3, digest); +} + +void sha256_transform_V (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[8]) +{ + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; + u32x e = digest[4]; + u32x f = digest[5]; + u32x g = digest[6]; + u32x h = digest[7]; + + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; + #define ROUND_EXPAND() \ { \ w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ @@ -129,7 +320,7 @@ void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const digest[7] += h; } -void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8]) +void hmac_sha256_pad_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -157,7 +348,7 @@ void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u ipad[6] = SHA256M_G; ipad[7] = SHA256M_H; - sha256_transform (w0, w1, w2, w3, ipad); + sha256_transform_V (w0, w1, w2, w3, ipad); w0[0] = w0[0] ^ 0x6a6a6a6a; w0[1] = w0[1] ^ 0x6a6a6a6a; @@ -185,10 +376,10 @@ void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u opad[6] = SHA256M_G; opad[7] = SHA256M_H; - sha256_transform (w0, w1, w2, w3, opad); + sha256_transform_V (w0, w1, w2, w3, opad); } -void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8]) +void hmac_sha256_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8], u32x digest[8]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -199,7 +390,7 @@ void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u digest[6] = ipad[6]; digest[7] = ipad[7]; - sha256_transform (w0, w1, w2, w3, digest); + sha256_transform_V (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -227,7 +418,7 @@ void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u digest[6] = opad[6]; digest[7] = opad[7]; - sha256_transform (w0, w1, w2, w3, digest); + sha256_transform_V (w0, w1, w2, w3, digest); } __kernel void m10900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global pbkdf2_sha256_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global pbkdf2_sha256_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -242,31 +433,31 @@ __kernel void m10900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 w0[4]; - w0[0] = swap32 (pws[gid].i[ 0]); - w0[1] = swap32 (pws[gid].i[ 1]); - w0[2] = swap32 (pws[gid].i[ 2]); - w0[3] = swap32 (pws[gid].i[ 3]); + w0[0] = swap32_S (pws[gid].i[ 0]); + w0[1] = swap32_S (pws[gid].i[ 1]); + w0[2] = swap32_S (pws[gid].i[ 2]); + w0[3] = swap32_S (pws[gid].i[ 3]); u32 w1[4]; - w1[0] = swap32 (pws[gid].i[ 4]); - w1[1] = swap32 (pws[gid].i[ 5]); - w1[2] = swap32 (pws[gid].i[ 6]); - w1[3] = swap32 (pws[gid].i[ 7]); + w1[0] = swap32_S (pws[gid].i[ 4]); + w1[1] = swap32_S (pws[gid].i[ 5]); + w1[2] = swap32_S (pws[gid].i[ 6]); + w1[3] = swap32_S (pws[gid].i[ 7]); u32 w2[4]; - w2[0] = swap32 (pws[gid].i[ 8]); - w2[1] = swap32 (pws[gid].i[ 9]); - w2[2] = swap32 (pws[gid].i[10]); - w2[3] = swap32 (pws[gid].i[11]); + w2[0] = swap32_S (pws[gid].i[ 8]); + w2[1] = swap32_S (pws[gid].i[ 9]); + w2[2] = swap32_S (pws[gid].i[10]); + w2[3] = swap32_S (pws[gid].i[11]); u32 w3[4]; - w3[0] = swap32 (pws[gid].i[12]); - w3[1] = swap32 (pws[gid].i[13]); - w3[2] = swap32 (pws[gid].i[14]); - w3[3] = swap32 (pws[gid].i[15]); + w3[0] = swap32_S (pws[gid].i[12]); + w3[1] = swap32_S (pws[gid].i[13]); + w3[2] = swap32_S (pws[gid].i[14]); + w3[3] = swap32_S (pws[gid].i[15]); /** * salt @@ -279,27 +470,27 @@ __kernel void m10900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 esalt_buf2[4]; u32 esalt_buf3[4]; - esalt_buf0[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 0]); - esalt_buf0[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); - esalt_buf0[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 2]); - esalt_buf0[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); - esalt_buf1[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 4]); - esalt_buf1[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); - esalt_buf1[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 6]); - esalt_buf1[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); - esalt_buf2[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 8]); - esalt_buf2[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); - esalt_buf2[2] = swap32 (esalt_bufs[salt_pos].salt_buf[10]); - esalt_buf2[3] = swap32 (esalt_bufs[salt_pos].salt_buf[11]); - esalt_buf3[0] = swap32 (esalt_bufs[salt_pos].salt_buf[12]); - esalt_buf3[1] = swap32 (esalt_bufs[salt_pos].salt_buf[13]); + esalt_buf0[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 0]); + esalt_buf0[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 1]); + esalt_buf0[2] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 2]); + esalt_buf0[3] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 3]); + esalt_buf1[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 4]); + esalt_buf1[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 5]); + esalt_buf1[2] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 6]); + esalt_buf1[3] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 7]); + esalt_buf2[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 8]); + esalt_buf2[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 9]); + esalt_buf2[2] = swap32_S (esalt_bufs[salt_pos].salt_buf[10]); + esalt_buf2[3] = swap32_S (esalt_bufs[salt_pos].salt_buf[11]); + esalt_buf3[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[12]); + esalt_buf3[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[13]); esalt_buf3[2] = 0; esalt_buf3[3] = (64 + salt_len + 4) * 8; u32 ipad[8]; u32 opad[8]; - hmac_sha256_pad (w0, w1, w2, w3, ipad, opad); + hmac_sha256_pad_S (w0, w1, w2, w3, ipad, opad); tmps[gid].ipad[0] = ipad[0]; tmps[gid].ipad[1] = ipad[1]; @@ -323,7 +514,7 @@ __kernel void m10900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf { u32 dgst[8]; - hmac_sha256_run (esalt_buf0, esalt_buf1, esalt_buf2, esalt_buf3, ipad, opad, dgst); + hmac_sha256_run_S (esalt_buf0, esalt_buf1, esalt_buf2, esalt_buf3, ipad, opad, dgst); tmps[gid].dgst[i + 0] = dgst[0]; tmps[gid].dgst[i + 1] = dgst[1]; @@ -351,58 +542,56 @@ __kernel void m10900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf if (gid >= gid_max) return; - u32 ipad[8]; - - ipad[0] = tmps[gid].ipad[0]; - ipad[1] = tmps[gid].ipad[1]; - ipad[2] = tmps[gid].ipad[2]; - ipad[3] = tmps[gid].ipad[3]; - ipad[4] = tmps[gid].ipad[4]; - ipad[5] = tmps[gid].ipad[5]; - ipad[6] = tmps[gid].ipad[6]; - ipad[7] = tmps[gid].ipad[7]; - - u32 opad[8]; - - opad[0] = tmps[gid].opad[0]; - opad[1] = tmps[gid].opad[1]; - opad[2] = tmps[gid].opad[2]; - opad[3] = tmps[gid].opad[3]; - opad[4] = tmps[gid].opad[4]; - opad[5] = tmps[gid].opad[5]; - opad[6] = tmps[gid].opad[6]; - opad[7] = tmps[gid].opad[7]; + u32x ipad[8]; + u32x opad[8]; + + ipad[0] = packv (tmps, ipad, gid, 0); + ipad[1] = packv (tmps, ipad, gid, 1); + ipad[2] = packv (tmps, ipad, gid, 2); + ipad[3] = packv (tmps, ipad, gid, 3); + ipad[4] = packv (tmps, ipad, gid, 4); + ipad[5] = packv (tmps, ipad, gid, 5); + ipad[6] = packv (tmps, ipad, gid, 6); + ipad[7] = packv (tmps, ipad, gid, 7); + + opad[0] = packv (tmps, opad, gid, 0); + opad[1] = packv (tmps, opad, gid, 1); + opad[2] = packv (tmps, opad, gid, 2); + opad[3] = packv (tmps, opad, gid, 3); + opad[4] = packv (tmps, opad, gid, 4); + opad[5] = packv (tmps, opad, gid, 5); + opad[6] = packv (tmps, opad, gid, 6); + opad[7] = packv (tmps, opad, gid, 7); for (u32 i = 0; i < 8; i += 8) { - u32 dgst[8]; - - dgst[0] = tmps[gid].dgst[i + 0]; - dgst[1] = tmps[gid].dgst[i + 1]; - dgst[2] = tmps[gid].dgst[i + 2]; - dgst[3] = tmps[gid].dgst[i + 3]; - dgst[4] = tmps[gid].dgst[i + 4]; - dgst[5] = tmps[gid].dgst[i + 5]; - dgst[6] = tmps[gid].dgst[i + 6]; - dgst[7] = tmps[gid].dgst[i + 7]; - - u32 out[8]; - - out[0] = tmps[gid].out[i + 0]; - out[1] = tmps[gid].out[i + 1]; - out[2] = tmps[gid].out[i + 2]; - out[3] = tmps[gid].out[i + 3]; - out[4] = tmps[gid].out[i + 4]; - out[5] = tmps[gid].out[i + 5]; - out[6] = tmps[gid].out[i + 6]; - out[7] = tmps[gid].out[i + 7]; + u32x dgst[8]; + u32x out[8]; + + dgst[0] = packv (tmps, dgst, gid, 0); + dgst[1] = packv (tmps, dgst, gid, 1); + dgst[2] = packv (tmps, dgst, gid, 2); + dgst[3] = packv (tmps, dgst, gid, 3); + dgst[4] = packv (tmps, dgst, gid, 4); + dgst[5] = packv (tmps, dgst, gid, 5); + dgst[6] = packv (tmps, dgst, gid, 6); + dgst[7] = packv (tmps, dgst, gid, 7); + + out[0] = packv (tmps, out, gid, 0); + out[1] = packv (tmps, out, gid, 1); + out[2] = packv (tmps, out, gid, 2); + out[3] = packv (tmps, out, gid, 3); + out[4] = packv (tmps, out, gid, 4); + out[5] = packv (tmps, out, gid, 5); + out[6] = packv (tmps, out, gid, 6); + out[7] = packv (tmps, out, gid, 7); for (u32 j = 0; j < loop_cnt; j++) { - u32 w0[4]; - u32 w1[4]; - u32 w2[4]; - u32 w3[4]; + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; w0[0] = dgst[0]; w0[1] = dgst[1]; @@ -421,7 +610,7 @@ __kernel void m10900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = 0; w3[3] = (64 + 32) * 8; - hmac_sha256_run (w0, w1, w2, w3, ipad, opad, dgst); + hmac_sha256_run_V (w0, w1, w2, w3, ipad, opad, dgst); out[0] ^= dgst[0]; out[1] ^= dgst[1]; @@ -433,23 +622,23 @@ __kernel void m10900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf out[7] ^= dgst[7]; } - tmps[gid].dgst[i + 0] = dgst[0]; - tmps[gid].dgst[i + 1] = dgst[1]; - tmps[gid].dgst[i + 2] = dgst[2]; - tmps[gid].dgst[i + 3] = dgst[3]; - tmps[gid].dgst[i + 4] = dgst[4]; - tmps[gid].dgst[i + 5] = dgst[5]; - tmps[gid].dgst[i + 6] = dgst[6]; - tmps[gid].dgst[i + 7] = dgst[7]; - - tmps[gid].out[i + 0] = out[0]; - tmps[gid].out[i + 1] = out[1]; - tmps[gid].out[i + 2] = out[2]; - tmps[gid].out[i + 3] = out[3]; - tmps[gid].out[i + 4] = out[4]; - tmps[gid].out[i + 5] = out[5]; - tmps[gid].out[i + 6] = out[6]; - tmps[gid].out[i + 7] = out[7]; + unpackv (tmps, dgst, gid, 0, dgst[0]); + unpackv (tmps, dgst, gid, 1, dgst[1]); + unpackv (tmps, dgst, gid, 2, dgst[2]); + unpackv (tmps, dgst, gid, 3, dgst[3]); + unpackv (tmps, dgst, gid, 4, dgst[4]); + unpackv (tmps, dgst, gid, 5, dgst[5]); + unpackv (tmps, dgst, gid, 6, dgst[6]); + unpackv (tmps, dgst, gid, 7, dgst[7]); + + unpackv (tmps, out, gid, 0, out[0]); + unpackv (tmps, out, gid, 1, out[1]); + unpackv (tmps, out, gid, 2, out[2]); + unpackv (tmps, out, gid, 3, out[3]); + unpackv (tmps, out, gid, 4, out[4]); + unpackv (tmps, out, gid, 5, out[5]); + unpackv (tmps, out, gid, 6, out[6]); + unpackv (tmps, out, gid, 7, out[7]); } } diff --git a/OpenCL/m11900.cl b/OpenCL/m11900.cl index 33bcc1bdc..53eec8f86 100644 --- a/OpenCL/m11900.cl +++ b/OpenCL/m11900.cl @@ -5,6 +5,8 @@ #define _PBKDF2_MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -14,14 +16,14 @@ #define DGST_R3 3 #include "include/kernel_functions.c" - #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" #define COMPARE_S "OpenCL/check_single_comp4.c" #define COMPARE_M "OpenCL/check_multi_comp4.c" -void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +void md5_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) { u32 a = digest[0]; u32 b = digest[1]; @@ -45,6 +47,189 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 u32 we_t = w3[2]; u32 wf_t = w3[3]; + MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); + MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); + MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); + MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); + MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); + MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); + MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + + MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); + MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); + MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); + MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); + MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); + MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); + MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + + MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); + MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); + MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); + MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); + MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); + MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); + MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + + MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); + MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); + MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); + MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); + MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); + MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); + MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + + digest[0] += a; + digest[1] += b; + digest[2] += c; + digest[3] += d; +} + +void hmac_md5_pad_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) +{ + w0[0] = w0[0] ^ 0x36363636; + w0[1] = w0[1] ^ 0x36363636; + w0[2] = w0[2] ^ 0x36363636; + w0[3] = w0[3] ^ 0x36363636; + w1[0] = w1[0] ^ 0x36363636; + w1[1] = w1[1] ^ 0x36363636; + w1[2] = w1[2] ^ 0x36363636; + w1[3] = w1[3] ^ 0x36363636; + w2[0] = w2[0] ^ 0x36363636; + w2[1] = w2[1] ^ 0x36363636; + w2[2] = w2[2] ^ 0x36363636; + w2[3] = w2[3] ^ 0x36363636; + w3[0] = w3[0] ^ 0x36363636; + w3[1] = w3[1] ^ 0x36363636; + w3[2] = w3[2] ^ 0x36363636; + w3[3] = w3[3] ^ 0x36363636; + + ipad[0] = MD5M_A; + ipad[1] = MD5M_B; + ipad[2] = MD5M_C; + ipad[3] = MD5M_D; + + md5_transform_S (w0, w1, w2, w3, ipad); + + w0[0] = w0[0] ^ 0x6a6a6a6a; + w0[1] = w0[1] ^ 0x6a6a6a6a; + w0[2] = w0[2] ^ 0x6a6a6a6a; + w0[3] = w0[3] ^ 0x6a6a6a6a; + w1[0] = w1[0] ^ 0x6a6a6a6a; + w1[1] = w1[1] ^ 0x6a6a6a6a; + w1[2] = w1[2] ^ 0x6a6a6a6a; + w1[3] = w1[3] ^ 0x6a6a6a6a; + w2[0] = w2[0] ^ 0x6a6a6a6a; + w2[1] = w2[1] ^ 0x6a6a6a6a; + w2[2] = w2[2] ^ 0x6a6a6a6a; + w2[3] = w2[3] ^ 0x6a6a6a6a; + w3[0] = w3[0] ^ 0x6a6a6a6a; + w3[1] = w3[1] ^ 0x6a6a6a6a; + w3[2] = w3[2] ^ 0x6a6a6a6a; + w3[3] = w3[3] ^ 0x6a6a6a6a; + + opad[0] = MD5M_A; + opad[1] = MD5M_B; + opad[2] = MD5M_C; + opad[3] = MD5M_D; + + md5_transform_S (w0, w1, w2, w3, opad); +} + +void hmac_md5_run_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) +{ + digest[0] = ipad[0]; + digest[1] = ipad[1]; + digest[2] = ipad[2]; + digest[3] = ipad[3]; + + md5_transform_S (w0, w1, w2, w3, digest); + + w0[0] = digest[0]; + w0[1] = digest[1]; + w0[2] = digest[2]; + w0[3] = digest[3]; + w1[0] = 0x80; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = (64 + 16) * 8; + w3[3] = 0; + + digest[0] = opad[0]; + digest[1] = opad[1]; + digest[2] = opad[2]; + digest[3] = opad[3]; + + md5_transform_S (w0, w1, w2, w3, digest); +} + +void md5_transform_V (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) +{ + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; + + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; + MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); @@ -119,7 +304,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) +void hmac_md5_pad_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -143,7 +328,7 @@ void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 ipad[2] = MD5M_C; ipad[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, ipad); + md5_transform_V (w0, w1, w2, w3, ipad); w0[0] = w0[0] ^ 0x6a6a6a6a; w0[1] = w0[1] ^ 0x6a6a6a6a; @@ -167,17 +352,17 @@ void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[2] = MD5M_C; opad[3] = MD5M_D; - md5_transform (w0, w1, w2, w3, opad); + md5_transform_V (w0, w1, w2, w3, opad); } -void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) +void hmac_md5_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4]) { digest[0] = ipad[0]; digest[1] = ipad[1]; digest[2] = ipad[2]; digest[3] = ipad[3]; - md5_transform (w0, w1, w2, w3, digest); + md5_transform_V (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -201,7 +386,7 @@ void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 digest[2] = opad[2]; digest[3] = opad[3]; - md5_transform (w0, w1, w2, w3, digest); + md5_transform_V (w0, w1, w2, w3, digest); } __kernel void m11900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global pbkdf2_md5_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global pbkdf2_md5_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -273,7 +458,7 @@ __kernel void m11900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 ipad[4]; u32 opad[4]; - hmac_md5_pad (w0, w1, w2, w3, ipad, opad); + hmac_md5_pad_S (w0, w1, w2, w3, ipad, opad); tmps[gid].ipad[0] = ipad[0]; tmps[gid].ipad[1] = ipad[1]; @@ -289,7 +474,7 @@ __kernel void m11900_init (__global pw_t *pws, __global kernel_rule_t *rules_buf { u32 dgst[4]; - hmac_md5_run (esalt_buf0, esalt_buf1, esalt_buf2, esalt_buf3, ipad, opad, dgst); + hmac_md5_run_S (esalt_buf0, esalt_buf1, esalt_buf2, esalt_buf3, ipad, opad, dgst); tmps[gid].dgst[i + 0] = dgst[0]; tmps[gid].dgst[i + 1] = dgst[1]; @@ -309,42 +494,40 @@ __kernel void m11900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf if (gid >= gid_max) return; - u32 ipad[4]; - - ipad[0] = tmps[gid].ipad[0]; - ipad[1] = tmps[gid].ipad[1]; - ipad[2] = tmps[gid].ipad[2]; - ipad[3] = tmps[gid].ipad[3]; + u32x ipad[4]; + u32x opad[4]; - u32 opad[4]; + ipad[0] = packv (tmps, ipad, gid, 0); + ipad[1] = packv (tmps, ipad, gid, 1); + ipad[2] = packv (tmps, ipad, gid, 2); + ipad[3] = packv (tmps, ipad, gid, 3); - opad[0] = tmps[gid].opad[0]; - opad[1] = tmps[gid].opad[1]; - opad[2] = tmps[gid].opad[2]; - opad[3] = tmps[gid].opad[3]; + opad[0] = packv (tmps, opad, gid, 0); + opad[1] = packv (tmps, opad, gid, 1); + opad[2] = packv (tmps, opad, gid, 2); + opad[3] = packv (tmps, opad, gid, 3); for (u32 i = 0; i < 4; i += 4) { - u32 dgst[4]; - - dgst[0] = tmps[gid].dgst[i + 0]; - dgst[1] = tmps[gid].dgst[i + 1]; - dgst[2] = tmps[gid].dgst[i + 2]; - dgst[3] = tmps[gid].dgst[i + 3]; + u32x dgst[4]; + u32x out[4]; - u32 out[4]; + dgst[0] = packv (tmps, dgst, gid, 0); + dgst[1] = packv (tmps, dgst, gid, 1); + dgst[2] = packv (tmps, dgst, gid, 2); + dgst[3] = packv (tmps, dgst, gid, 3); - out[0] = tmps[gid].out[i + 0]; - out[1] = tmps[gid].out[i + 1]; - out[2] = tmps[gid].out[i + 2]; - out[3] = tmps[gid].out[i + 3]; + out[0] = packv (tmps, out, gid, 0); + out[1] = packv (tmps, out, gid, 1); + out[2] = packv (tmps, out, gid, 2); + out[3] = packv (tmps, out, gid, 3); for (u32 j = 0; j < loop_cnt; j++) { - u32 w0[4]; - u32 w1[4]; - u32 w2[4]; - u32 w3[4]; + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; w0[0] = dgst[0]; w0[1] = dgst[1]; @@ -363,7 +546,7 @@ __kernel void m11900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = (64 + 16) * 8; w3[3] = 0; - hmac_md5_run (w0, w1, w2, w3, ipad, opad, dgst); + hmac_md5_run_V (w0, w1, w2, w3, ipad, opad, dgst); out[0] ^= dgst[0]; out[1] ^= dgst[1]; @@ -371,15 +554,15 @@ __kernel void m11900_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf out[3] ^= dgst[3]; } - tmps[gid].dgst[i + 0] = dgst[0]; - tmps[gid].dgst[i + 1] = dgst[1]; - tmps[gid].dgst[i + 2] = dgst[2]; - tmps[gid].dgst[i + 3] = dgst[3]; + unpackv (tmps, dgst, gid, 0, dgst[0]); + unpackv (tmps, dgst, gid, 1, dgst[1]); + unpackv (tmps, dgst, gid, 2, dgst[2]); + unpackv (tmps, dgst, gid, 3, dgst[3]); - tmps[gid].out[i + 0] = out[0]; - tmps[gid].out[i + 1] = out[1]; - tmps[gid].out[i + 2] = out[2]; - tmps[gid].out[i + 3] = out[3]; + unpackv (tmps, out, gid, 0, out[0]); + unpackv (tmps, out, gid, 1, out[1]); + unpackv (tmps, out, gid, 2, out[2]); + unpackv (tmps, out, gid, 3, out[3]); } } diff --git a/OpenCL/m12000.cl b/OpenCL/m12000.cl index 00b0aac25..ffbcf867d 100644 --- a/OpenCL/m12000.cl +++ b/OpenCL/m12000.cl @@ -5,6 +5,8 @@ #define _PBKDF2_SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -14,14 +16,14 @@ #define DGST_R3 3 #include "include/kernel_functions.c" - #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" #define COMPARE_S "OpenCL/check_single_comp4.c" #define COMPARE_M "OpenCL/check_multi_comp4.c" -void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +void sha1_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) { u32 A = digest[0]; u32 B = digest[1]; @@ -49,6 +51,223 @@ void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u3 #undef K #define K SHA1C00 + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, w0_t); + SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, w1_t); + SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, w2_t); + SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, w3_t); + SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, w4_t); + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, w5_t); + SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, w6_t); + SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, w7_t); + SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, w8_t); + SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, w9_t); + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, wa_t); + SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, wb_t); + SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, wc_t); + SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, wd_t); + SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, we_t); + SHA1_STEP_S (SHA1_F0o, A, B, C, D, E, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F0o, E, A, B, C, D, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F0o, D, E, A, B, C, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F0o, C, D, E, A, B, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F0o, B, C, D, E, A, w3_t); + + #undef K + #define K SHA1C01 + + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w7_t); + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, wb_t); + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w3_t); + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w7_t); + + #undef K + #define K SHA1C02 + + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, wb_t); + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, w3_t); + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F2o, A, B, C, D, E, w7_t); + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F2o, E, A, B, C, D, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F2o, D, E, A, B, C, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F2o, C, D, E, A, B, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F2o, B, C, D, E, A, wb_t); + + #undef K + #define K SHA1C03 + + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, wf_t); + w0_t = rotl32_S ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w0_t); + w1_t = rotl32_S ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w1_t); + w2_t = rotl32_S ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w2_t); + w3_t = rotl32_S ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w3_t); + w4_t = rotl32_S ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w4_t); + w5_t = rotl32_S ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, w5_t); + w6_t = rotl32_S ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, w6_t); + w7_t = rotl32_S ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, w7_t); + w8_t = rotl32_S ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, w8_t); + w9_t = rotl32_S ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, w9_t); + wa_t = rotl32_S ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, wa_t); + wb_t = rotl32_S ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP_S (SHA1_F1, A, B, C, D, E, wb_t); + wc_t = rotl32_S ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP_S (SHA1_F1, E, A, B, C, D, wc_t); + wd_t = rotl32_S ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP_S (SHA1_F1, D, E, A, B, C, wd_t); + we_t = rotl32_S ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP_S (SHA1_F1, C, D, E, A, B, we_t); + wf_t = rotl32_S ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP_S (SHA1_F1, B, C, D, E, A, wf_t); + + digest[0] += A; + digest[1] += B; + digest[2] += C; + digest[3] += D; + digest[4] += E; +} + +void hmac_sha1_pad_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) +{ + w0[0] = w0[0] ^ 0x36363636; + w0[1] = w0[1] ^ 0x36363636; + w0[2] = w0[2] ^ 0x36363636; + w0[3] = w0[3] ^ 0x36363636; + w1[0] = w1[0] ^ 0x36363636; + w1[1] = w1[1] ^ 0x36363636; + w1[2] = w1[2] ^ 0x36363636; + w1[3] = w1[3] ^ 0x36363636; + w2[0] = w2[0] ^ 0x36363636; + w2[1] = w2[1] ^ 0x36363636; + w2[2] = w2[2] ^ 0x36363636; + w2[3] = w2[3] ^ 0x36363636; + w3[0] = w3[0] ^ 0x36363636; + w3[1] = w3[1] ^ 0x36363636; + w3[2] = w3[2] ^ 0x36363636; + w3[3] = w3[3] ^ 0x36363636; + + ipad[0] = SHA1M_A; + ipad[1] = SHA1M_B; + ipad[2] = SHA1M_C; + ipad[3] = SHA1M_D; + ipad[4] = SHA1M_E; + + sha1_transform_S (w0, w1, w2, w3, ipad); + + w0[0] = w0[0] ^ 0x6a6a6a6a; + w0[1] = w0[1] ^ 0x6a6a6a6a; + w0[2] = w0[2] ^ 0x6a6a6a6a; + w0[3] = w0[3] ^ 0x6a6a6a6a; + w1[0] = w1[0] ^ 0x6a6a6a6a; + w1[1] = w1[1] ^ 0x6a6a6a6a; + w1[2] = w1[2] ^ 0x6a6a6a6a; + w1[3] = w1[3] ^ 0x6a6a6a6a; + w2[0] = w2[0] ^ 0x6a6a6a6a; + w2[1] = w2[1] ^ 0x6a6a6a6a; + w2[2] = w2[2] ^ 0x6a6a6a6a; + w2[3] = w2[3] ^ 0x6a6a6a6a; + w3[0] = w3[0] ^ 0x6a6a6a6a; + w3[1] = w3[1] ^ 0x6a6a6a6a; + w3[2] = w3[2] ^ 0x6a6a6a6a; + w3[3] = w3[3] ^ 0x6a6a6a6a; + + opad[0] = SHA1M_A; + opad[1] = SHA1M_B; + opad[2] = SHA1M_C; + opad[3] = SHA1M_D; + opad[4] = SHA1M_E; + + sha1_transform_S (w0, w1, w2, w3, opad); +} + +void hmac_sha1_run_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +{ + digest[0] = ipad[0]; + digest[1] = ipad[1]; + digest[2] = ipad[2]; + digest[3] = ipad[3]; + digest[4] = ipad[4]; + + sha1_transform_S (w0, w1, w2, w3, digest); + + w0[0] = digest[0]; + w0[1] = digest[1]; + w0[2] = digest[2]; + w0[3] = digest[3]; + w1[0] = digest[4]; + w1[1] = 0x80000000; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = (64 + 20) * 8; + + digest[0] = opad[0]; + digest[1] = opad[1]; + digest[2] = opad[2]; + digest[3] = opad[3]; + digest[4] = opad[4]; + + sha1_transform_S (w0, w1, w2, w3, digest); +} + +void sha1_transform_V (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) +{ + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; + + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; + + #undef K + #define K SHA1C00 + SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t); @@ -149,7 +368,7 @@ void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u3 digest[4] += E; } -void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) +void hmac_sha1_pad_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -174,7 +393,7 @@ void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 ipad[3] = SHA1M_D; ipad[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, ipad); + sha1_transform_V (w0, w1, w2, w3, ipad); w0[0] = w0[0] ^ 0x6a6a6a6a; w0[1] = w0[1] ^ 0x6a6a6a6a; @@ -199,10 +418,10 @@ void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[3] = SHA1M_D; opad[4] = SHA1M_E; - sha1_transform (w0, w1, w2, w3, opad); + sha1_transform_V (w0, w1, w2, w3, opad); } -void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +void hmac_sha1_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -210,7 +429,7 @@ void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 digest[3] = ipad[3]; digest[4] = ipad[4]; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_V (w0, w1, w2, w3, digest); w0[0] = digest[0]; w0[1] = digest[1]; @@ -235,7 +454,7 @@ void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 digest[3] = opad[3]; digest[4] = opad[4]; - sha1_transform (w0, w1, w2, w3, digest); + sha1_transform_V (w0, w1, w2, w3, digest); } __kernel void m12000_init (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global pbkdf2_sha1_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global pbkdf2_sha1_t *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) @@ -250,31 +469,31 @@ __kernel void m12000_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 w0[4]; - w0[0] = swap32 (pws[gid].i[ 0]); - w0[1] = swap32 (pws[gid].i[ 1]); - w0[2] = swap32 (pws[gid].i[ 2]); - w0[3] = swap32 (pws[gid].i[ 3]); + w0[0] = swap32_S (pws[gid].i[ 0]); + w0[1] = swap32_S (pws[gid].i[ 1]); + w0[2] = swap32_S (pws[gid].i[ 2]); + w0[3] = swap32_S (pws[gid].i[ 3]); u32 w1[4]; - w1[0] = swap32 (pws[gid].i[ 4]); - w1[1] = swap32 (pws[gid].i[ 5]); - w1[2] = swap32 (pws[gid].i[ 6]); - w1[3] = swap32 (pws[gid].i[ 7]); + w1[0] = swap32_S (pws[gid].i[ 4]); + w1[1] = swap32_S (pws[gid].i[ 5]); + w1[2] = swap32_S (pws[gid].i[ 6]); + w1[3] = swap32_S (pws[gid].i[ 7]); u32 w2[4]; - w2[0] = swap32 (pws[gid].i[ 8]); - w2[1] = swap32 (pws[gid].i[ 9]); - w2[2] = swap32 (pws[gid].i[10]); - w2[3] = swap32 (pws[gid].i[11]); + w2[0] = swap32_S (pws[gid].i[ 8]); + w2[1] = swap32_S (pws[gid].i[ 9]); + w2[2] = swap32_S (pws[gid].i[10]); + w2[3] = swap32_S (pws[gid].i[11]); u32 w3[4]; - w3[0] = swap32 (pws[gid].i[12]); - w3[1] = swap32 (pws[gid].i[13]); - w3[2] = swap32 (pws[gid].i[14]); - w3[3] = swap32 (pws[gid].i[15]); + w3[0] = swap32_S (pws[gid].i[12]); + w3[1] = swap32_S (pws[gid].i[13]); + w3[2] = swap32_S (pws[gid].i[14]); + w3[3] = swap32_S (pws[gid].i[15]); /** * salt @@ -287,27 +506,27 @@ __kernel void m12000_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 esalt_buf2[4]; u32 esalt_buf3[4]; - esalt_buf0[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 0]); - esalt_buf0[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 1]); - esalt_buf0[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 2]); - esalt_buf0[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 3]); - esalt_buf1[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 4]); - esalt_buf1[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 5]); - esalt_buf1[2] = swap32 (esalt_bufs[salt_pos].salt_buf[ 6]); - esalt_buf1[3] = swap32 (esalt_bufs[salt_pos].salt_buf[ 7]); - esalt_buf2[0] = swap32 (esalt_bufs[salt_pos].salt_buf[ 8]); - esalt_buf2[1] = swap32 (esalt_bufs[salt_pos].salt_buf[ 9]); - esalt_buf2[2] = swap32 (esalt_bufs[salt_pos].salt_buf[10]); - esalt_buf2[3] = swap32 (esalt_bufs[salt_pos].salt_buf[11]); - esalt_buf3[0] = swap32 (esalt_bufs[salt_pos].salt_buf[12]); - esalt_buf3[1] = swap32 (esalt_bufs[salt_pos].salt_buf[13]); + esalt_buf0[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 0]); + esalt_buf0[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 1]); + esalt_buf0[2] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 2]); + esalt_buf0[3] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 3]); + esalt_buf1[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 4]); + esalt_buf1[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 5]); + esalt_buf1[2] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 6]); + esalt_buf1[3] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 7]); + esalt_buf2[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 8]); + esalt_buf2[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[ 9]); + esalt_buf2[2] = swap32_S (esalt_bufs[salt_pos].salt_buf[10]); + esalt_buf2[3] = swap32_S (esalt_bufs[salt_pos].salt_buf[11]); + esalt_buf3[0] = swap32_S (esalt_bufs[salt_pos].salt_buf[12]); + esalt_buf3[1] = swap32_S (esalt_bufs[salt_pos].salt_buf[13]); esalt_buf3[2] = 0; esalt_buf3[3] = (64 + salt_len + 4) * 8; u32 ipad[5]; u32 opad[5]; - hmac_sha1_pad (w0, w1, w2, w3, ipad, opad); + hmac_sha1_pad_S (w0, w1, w2, w3, ipad, opad); tmps[gid].ipad[0] = ipad[0]; tmps[gid].ipad[1] = ipad[1]; @@ -325,7 +544,7 @@ __kernel void m12000_init (__global pw_t *pws, __global kernel_rule_t *rules_buf { u32 dgst[5]; - hmac_sha1_run (esalt_buf0, esalt_buf1, esalt_buf2, esalt_buf3, ipad, opad, dgst); + hmac_sha1_run_S (esalt_buf0, esalt_buf1, esalt_buf2, esalt_buf3, ipad, opad, dgst); tmps[gid].dgst[i + 0] = dgst[0]; tmps[gid].dgst[i + 1] = dgst[1]; @@ -347,46 +566,44 @@ __kernel void m12000_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf if (gid >= gid_max) return; - u32 ipad[5]; - - ipad[0] = tmps[gid].ipad[0]; - ipad[1] = tmps[gid].ipad[1]; - ipad[2] = tmps[gid].ipad[2]; - ipad[3] = tmps[gid].ipad[3]; - ipad[4] = tmps[gid].ipad[4]; + u32x ipad[5]; + u32x opad[5]; - u32 opad[5]; + ipad[0] = packv (tmps, ipad, gid, 0); + ipad[1] = packv (tmps, ipad, gid, 1); + ipad[2] = packv (tmps, ipad, gid, 2); + ipad[3] = packv (tmps, ipad, gid, 3); + ipad[4] = packv (tmps, ipad, gid, 4); - opad[0] = tmps[gid].opad[0]; - opad[1] = tmps[gid].opad[1]; - opad[2] = tmps[gid].opad[2]; - opad[3] = tmps[gid].opad[3]; - opad[4] = tmps[gid].opad[4]; + opad[0] = packv (tmps, opad, gid, 0); + opad[1] = packv (tmps, opad, gid, 1); + opad[2] = packv (tmps, opad, gid, 2); + opad[3] = packv (tmps, opad, gid, 3); + opad[4] = packv (tmps, opad, gid, 4); for (u32 i = 0; i < 5; i += 5) { - u32 dgst[5]; + u32x dgst[5]; + u32x out[5]; - dgst[0] = tmps[gid].dgst[i + 0]; - dgst[1] = tmps[gid].dgst[i + 1]; - dgst[2] = tmps[gid].dgst[i + 2]; - dgst[3] = tmps[gid].dgst[i + 3]; - dgst[4] = tmps[gid].dgst[i + 4]; + dgst[0] = packv (tmps, dgst, gid, 0); + dgst[1] = packv (tmps, dgst, gid, 1); + dgst[2] = packv (tmps, dgst, gid, 2); + dgst[3] = packv (tmps, dgst, gid, 3); + dgst[4] = packv (tmps, dgst, gid, 4); - u32 out[5]; - - out[0] = tmps[gid].out[i + 0]; - out[1] = tmps[gid].out[i + 1]; - out[2] = tmps[gid].out[i + 2]; - out[3] = tmps[gid].out[i + 3]; - out[4] = tmps[gid].out[i + 4]; + out[0] = packv (tmps, out, gid, 0); + out[1] = packv (tmps, out, gid, 1); + out[2] = packv (tmps, out, gid, 2); + out[3] = packv (tmps, out, gid, 3); + out[4] = packv (tmps, out, gid, 4); for (u32 j = 0; j < loop_cnt; j++) { - u32 w0[4]; - u32 w1[4]; - u32 w2[4]; - u32 w3[4]; + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; w0[0] = dgst[0]; w0[1] = dgst[1]; @@ -405,7 +622,7 @@ __kernel void m12000_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = 0; w3[3] = (64 + 20) * 8; - hmac_sha1_run (w0, w1, w2, w3, ipad, opad, dgst); + hmac_sha1_run_V (w0, w1, w2, w3, ipad, opad, dgst); out[0] ^= dgst[0]; out[1] ^= dgst[1]; @@ -414,17 +631,17 @@ __kernel void m12000_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf out[4] ^= dgst[4]; } - tmps[gid].dgst[i + 0] = dgst[0]; - tmps[gid].dgst[i + 1] = dgst[1]; - tmps[gid].dgst[i + 2] = dgst[2]; - tmps[gid].dgst[i + 3] = dgst[3]; - tmps[gid].dgst[i + 4] = dgst[4]; - - tmps[gid].out[i + 0] = out[0]; - tmps[gid].out[i + 1] = out[1]; - tmps[gid].out[i + 2] = out[2]; - tmps[gid].out[i + 3] = out[3]; - tmps[gid].out[i + 4] = out[4]; + unpackv (tmps, dgst, gid, 0, dgst[0]); + unpackv (tmps, dgst, gid, 1, dgst[1]); + unpackv (tmps, dgst, gid, 2, dgst[2]); + unpackv (tmps, dgst, gid, 3, dgst[3]); + unpackv (tmps, dgst, gid, 4, dgst[4]); + + unpackv (tmps, out, gid, 0, out[0]); + unpackv (tmps, out, gid, 1, out[1]); + unpackv (tmps, out, gid, 2, out[2]); + unpackv (tmps, out, gid, 3, out[3]); + unpackv (tmps, out, gid, 4, out[4]); } } diff --git a/OpenCL/simd.c b/OpenCL/simd.c index 865819dc0..9e87b2c6a 100644 --- a/OpenCL/simd.c +++ b/OpenCL/simd.c @@ -1251,6 +1251,30 @@ inline u32x ix_create_combt (__global comb_t *combs_buf, const u32 il_pos, const #define packvf(arr,var,gid) (u32x) ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var) #endif +#if VECT_SIZE == 1 +#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) * 1) + 0].var[(idx)]) +#elif VECT_SIZE == 2 +#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) * 2) + 0].var[(idx)], (arr)[((gid) * 2) + 1].var[(idx)]) +#elif VECT_SIZE == 4 +#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) * 4) + 0].var[(idx)], (arr)[((gid) * 4) + 1].var[(idx)], (arr)[((gid) * 4) + 2].var[(idx)], (arr)[((gid) * 4) + 3].var[(idx)]) +#elif VECT_SIZE == 8 +#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)]) +#elif VECT_SIZE == 16 +#define pack64v(arr,var,gid,idx) (u64x) ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)]) +#endif + +#if VECT_SIZE == 1 +#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) * 1) + 0].var) +#elif VECT_SIZE == 2 +#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) * 2) + 0].var, (arr)[((gid) * 2) + 1].var) +#elif VECT_SIZE == 4 +#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) * 4) + 0].var, (arr)[((gid) * 4) + 1].var, (arr)[((gid) * 4) + 2].var, (arr)[((gid) * 4) + 3].var) +#elif VECT_SIZE == 8 +#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) * 8) + 0].var, (arr)[((gid) * 8) + 1].var, (arr)[((gid) * 8) + 2].var, (arr)[((gid) * 8) + 3].var, (arr)[((gid) * 8) + 4].var, (arr)[((gid) * 8) + 5].var, (arr)[((gid) * 8) + 6].var, (arr)[((gid) * 8) + 7].var) +#elif VECT_SIZE == 16 +#define pack64vf(arr,var,gid) (u64x) ((arr)[((gid) * 16) + 0].var, (arr)[((gid) * 16) + 1].var, (arr)[((gid) * 16) + 2].var, (arr)[((gid) * 16) + 3].var, (arr)[((gid) * 16) + 4].var, (arr)[((gid) * 16) + 5].var, (arr)[((gid) * 16) + 6].var, (arr)[((gid) * 16) + 7].var, (arr)[((gid) * 16) + 8].var, (arr)[((gid) * 16) + 9].var, (arr)[((gid) * 16) + 10].var, (arr)[((gid) * 16) + 11].var, (arr)[((gid) * 16) + 12].var, (arr)[((gid) * 16) + 13].var, (arr)[((gid) * 16) + 14].var, (arr)[((gid) * 16) + 15].var) +#endif + #if VECT_SIZE == 1 #define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 1) + 0].var[(idx)] = val; #elif VECT_SIZE == 2 diff --git a/include/kernel_functions.c b/include/kernel_functions.c index 0c70f5eff..0ccbb2534 100644 --- a/include/kernel_functions.c +++ b/include/kernel_functions.c @@ -197,6 +197,11 @@ #define SHIFT_RIGHT_32(x,n) ((x) >> (n)) +#define SHA256_S0_S(x) (rotl32_S ((x), 25u) ^ rotl32_S ((x), 14u) ^ SHIFT_RIGHT_32 ((x), 3u)) +#define SHA256_S1_S(x) (rotl32_S ((x), 15u) ^ rotl32_S ((x), 13u) ^ SHIFT_RIGHT_32 ((x), 10u)) +#define SHA256_S2_S(x) (rotl32_S ((x), 30u) ^ rotl32_S ((x), 19u) ^ rotl32_S ((x), 10u)) +#define SHA256_S3_S(x) (rotl32_S ((x), 26u) ^ rotl32_S ((x), 21u) ^ rotl32_S ((x), 7u)) + #define SHA256_S0(x) (rotl32 ((x), 25u) ^ rotl32 ((x), 14u) ^ SHIFT_RIGHT_32 ((x), 3u)) #define SHA256_S1(x) (rotl32 ((x), 15u) ^ rotl32 ((x), 13u) ^ SHIFT_RIGHT_32 ((x), 10u)) #define SHA256_S2(x) (rotl32 ((x), 30u) ^ rotl32 ((x), 19u) ^ rotl32 ((x), 10u)) @@ -223,15 +228,28 @@ #define SHA256_F1o(x,y,z) (SHA256_F1 ((x), (y), (z))) #endif -#define SHA256_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \ -{ \ - h += K; \ - h += x; \ - h += SHA256_S3 (e); \ - h += F1 (e,f,g); \ - d += h; \ - h += SHA256_S2 (a); \ - h += F0 (a,b,c); \ +#define SHA256_STEP_S(F0,F1,a,b,c,d,e,f,g,h,x,K) \ +{ \ + h += K; \ + h += x; \ + h += SHA256_S3_S (e); \ + h += F1 (e,f,g); \ + d += h; \ + h += SHA256_S2_S (a); \ + h += F0 (a,b,c); \ +} + +#define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + y + SHA256_S0_S (z) + w) + +#define SHA256_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \ +{ \ + h += K; \ + h += x; \ + h += SHA256_S3 (e); \ + h += F1 (e,f,g); \ + d += h; \ + h += SHA256_S2 (a); \ + h += F0 (a,b,c); \ } #define SHA256_EXPAND(x,y,z,w) (SHA256_S1 (x) + y + SHA256_S0 (z) + w) @@ -283,6 +301,11 @@ #define SHIFT_RIGHT_64(x,n) ((x) >> (n)) +#define SHA512_S0_S(x) (rotr64_S ((x), 28) ^ rotr64_S ((x), 34) ^ rotr64_S ((x), 39)) +#define SHA512_S1_S(x) (rotr64_S ((x), 14) ^ rotr64_S ((x), 18) ^ rotr64_S ((x), 41)) +#define SHA512_S2_S(x) (rotr64_S ((x), 1) ^ rotr64_S ((x), 8) ^ SHIFT_RIGHT_64 ((x), 7)) +#define SHA512_S3_S(x) (rotr64_S ((x), 19) ^ rotr64_S ((x), 61) ^ SHIFT_RIGHT_64 ((x), 6)) + #define SHA512_S0(x) (rotr64 ((x), 28) ^ rotr64 ((x), 34) ^ rotr64 ((x), 39)) #define SHA512_S1(x) (rotr64 ((x), 14) ^ rotr64 ((x), 18) ^ rotr64 ((x), 41)) #define SHA512_S2(x) (rotr64 ((x), 1) ^ rotr64 ((x), 8) ^ SHIFT_RIGHT_64 ((x), 7)) @@ -306,6 +329,19 @@ #define SHA512_F1o(x,y,z) (SHA512_F1 ((x), (y), (z))) #endif +#define SHA512_STEP_S(F0,F1,a,b,c,d,e,f,g,h,x,K) \ +{ \ + h += K; \ + h += x; \ + h += SHA512_S1_S (e); \ + h += F0 (e, f, g); \ + d += h; \ + h += SHA512_S0_S (a); \ + h += F1 (a, b, c); \ +} + +#define SHA512_EXPAND_S(x,y,z,w) (SHA512_S3_S (x) + y + SHA512_S2_S (z) + w) + #define SHA512_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \ { \ h += K; \ diff --git a/src/hashcat.c b/src/hashcat.c index 220dff0be..1e7393e99 100644 --- a/src/hashcat.c +++ b/src/hashcat.c @@ -8992,7 +8992,8 @@ int main (int argc, char **argv) parse_func = sha512osx_parse_hash; sort_by_digest = sort_by_digest_8_16; opti_type = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_USES_BITS_64; + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9008,7 +9009,8 @@ int main (int argc, char **argv) parse_func = sha512grub_parse_hash; sort_by_digest = sort_by_digest_8_16; opti_type = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_USES_BITS_64; + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9354,7 +9356,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_4_32; parse_func = cisco8_parse_hash; sort_by_digest = sort_by_digest_4_32; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9561,7 +9564,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_4_32; parse_func = djangopbkdf2_parse_hash; sort_by_digest = sort_by_digest_4_32; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9757,7 +9761,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_4_32; parse_func = pbkdf2_sha256_parse_hash; sort_by_digest = sort_by_digest_4_32; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9929,7 +9934,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_4_32; parse_func = pbkdf2_md5_parse_hash; sort_by_digest = sort_by_digest_4_32; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9946,7 +9952,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_4_32; parse_func = pbkdf2_sha1_parse_hash; sort_by_digest = sort_by_digest_4_32; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9964,7 +9971,8 @@ int main (int argc, char **argv) parse_func = pbkdf2_sha512_parse_hash; sort_by_digest = sort_by_digest_8_16; opti_type = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_USES_BITS_64; + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_SLOW_HASH_SIMD; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2;