diff --git a/.gitmodules b/.gitmodules index 0d14e5c45..2bba20905 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "deps/OpenCL-Headers/CL"] - path = deps/OpenCL-Headers/CL +[submodule "OpenCL-Headers"] + path = deps/git/OpenCL-Headers url = https://github.com/KhronosGroup/OpenCL-Headers.git diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index a09b1f1f7..4a8fd0801 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -30952,6 +30952,18 @@ DECLSPEC void append_0x01_2x4_S (u32 *w0, u32 *w1, const u32 offset) append_helper_1x4_S (w1, ((offset16 == 1) ? 0x01010101 : 0), v); } +/* Scalar helper for a two-array (w0, w1) buffer: builds the mark vector v for offset with set_mark_1x4_S, then calls append_helper_1x4_S on both arrays, passing the pattern 0x06060606 to the array selected by offset / 16 (0 selects w0, 1 selects w1) and 0 to the other. NOTE(review): presumably the net effect is a single 0x06 byte stored at byte position offset - confirm against append_helper_1x4_S, which is not visible here. */ DECLSPEC void append_0x06_2x4_S (u32 *w0, u32 *w1, const u32 offset) +{ + u32 v[4]; + + set_mark_1x4_S (v, offset); + + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x06060606 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x06060606 : 0), v); +} + DECLSPEC void append_0x80_1x4_S (u32 *w0, const u32 offset) { u32 v[4]; @@ -60122,6 +60134,64 @@ DECLSPEC void append_0x01_2x4_VV (u32x *w0, u32x *w1, const u32x offset) #endif } +/* Vector front end of append_0x06_2x4_S. With VECT_SIZE == 1 the scalar routine is called directly on w0 and w1; for VECT_SIZE 2, 4, 8 and 16 every lane is handled in turn: PACKVS24 on lane i, the scalar routine on the temporaries t0 and t1 with that lane's component of offset, then PACKSV24 on lane i. NOTE(review): PACKVS24 and PACKSV24 presumably copy one lane out of and back into w0 and w1 - confirm against their definitions. */ DECLSPEC void append_0x06_2x4_VV (u32x *w0, u32x *w1, const u32x offset) +{ + #if VECT_SIZE == 1 + + append_0x06_2x4_S (w0, w1, offset); + + #else + + u32 t0[4]; + u32 t1[4]; + + #endif + + #if VECT_SIZE == 2 + + PACKVS24 (t0, t1, w0, w1, 0); append_0x06_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); + PACKVS24 (t0, t1, w0, w1, 1); append_0x06_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); + + #elif VECT_SIZE == 4 + + PACKVS24 (t0, t1, w0, w1, 0); append_0x06_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); + PACKVS24 (t0, t1, w0, w1, 1); append_0x06_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); + PACKVS24 (t0, t1, w0, w1, 2); append_0x06_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); + PACKVS24 (t0, t1, w0, w1, 3); append_0x06_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); + + #elif VECT_SIZE == 8 + + PACKVS24 (t0, t1, w0, w1, 0); 
append_0x06_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); + PACKVS24 (t0, t1, w0, w1, 1); append_0x06_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); + PACKVS24 (t0, t1, w0, w1, 2); append_0x06_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); + PACKVS24 (t0, t1, w0, w1, 3); append_0x06_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); + PACKVS24 (t0, t1, w0, w1, 4); append_0x06_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4); + PACKVS24 (t0, t1, w0, w1, 5); append_0x06_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5); + PACKVS24 (t0, t1, w0, w1, 6); append_0x06_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6); + PACKVS24 (t0, t1, w0, w1, 7); append_0x06_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7); + + #elif VECT_SIZE == 16 + + PACKVS24 (t0, t1, w0, w1, 0); append_0x06_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); + PACKVS24 (t0, t1, w0, w1, 1); append_0x06_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); + PACKVS24 (t0, t1, w0, w1, 2); append_0x06_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); + PACKVS24 (t0, t1, w0, w1, 3); append_0x06_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); + PACKVS24 (t0, t1, w0, w1, 4); append_0x06_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4); + PACKVS24 (t0, t1, w0, w1, 5); append_0x06_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5); + PACKVS24 (t0, t1, w0, w1, 6); append_0x06_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6); + PACKVS24 (t0, t1, w0, w1, 7); append_0x06_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7); + PACKVS24 (t0, t1, w0, w1, 8); append_0x06_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8); + PACKVS24 (t0, t1, w0, w1, 9); append_0x06_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9); + PACKVS24 (t0, t1, w0, w1, a); append_0x06_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a); + PACKVS24 (t0, t1, w0, w1, b); append_0x06_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, 
t1, w0, w1, b); + PACKVS24 (t0, t1, w0, w1, c); append_0x06_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c); + PACKVS24 (t0, t1, w0, w1, d); append_0x06_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d); + PACKVS24 (t0, t1, w0, w1, e); append_0x06_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e); + PACKVS24 (t0, t1, w0, w1, f); append_0x06_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f); + + #endif +} + DECLSPEC void append_0x80_2x4_VV (u32x *w0, u32x *w1, const u32x offset) { #if VECT_SIZE == 1 diff --git a/OpenCL/m17300_a0-optimized.cl b/OpenCL/m17300_a0-optimized.cl new file mode 100644 index 000000000..0e53c7256 --- /dev/null +++ b/OpenCL/m17300_a0-optimized.cl @@ -0,0 +1,479 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp_optimized.h" +#include "inc_rp_optimized.cl" +#include "inc_simd.cl" + +/* Table of 24 64-bit constants; the kernels below XOR entry [round] into a00 in their Iota step. */ __constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +/* Rule-based kernel: candidates come from apply_rules_vect and the result is checked with COMPARE_M_SIMD. */ __kernel void m17300_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + /* Copy the first eight 32-bit words of this work-item's entry in pws into pw_buf0 and pw_buf1. */ + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + /* Only w0 and w1 are passed on and read below; w2 and w3 are not used again in this kernel. NOTE(review): apply_rules_vect presumably writes the rule-modified candidate into w0 and w1 and returns its length - confirm against inc_rp_optimized.cl. */ + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + /* 25 state lanes a00..a44: the first four are built from w0 and w1, a32 has only its top bit set, all others start at zero. NOTE(review): a32 is lane 17 (5 * 3 + 2), so the set bit presumably closes a 144-byte rate block as in SHA3-224 padding - confirm. */ + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 
= 0; + u64x a43 = 0; + u64x a44 = 0; + /* Rho_Pi (ad, r) is one link of the Rho Pi chain: lane ad receives the carried value t rotated left by r, and the previous content of ad becomes the new t (bc0 serves as scratch). */ + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + /* Full rounds 0 .. KECCAK_ROUNDS - 2 (Theta, Rho Pi, Chi, Iota); the last round is written out separately after the loop. */ for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + 
bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + /* Final round, reduced to what the comparison below reads: Theta leaves out a01, a11, a40, a42, a14 and a24, the Rho Pi chain stops after a02, Chi updates only a02 and a03, and there is no Iota step. */ + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + /* The four comparison words are the low half of a03, a constant 0, and the low and high halves of a02. */ + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17300_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const 
digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* Empty body: m17300_m08 performs no work. NOTE(review): presumably kept only so that this entry point exists for the host - confirm. */ +} + +__kernel void m17300_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* Empty body: m17300_m16 performs no work. NOTE(review): presumably kept only so that this entry point exists for the host - confirm. */ +} + +/* Rule-based kernel with the same computation as m17300_m04, but it loads a search[] array from digests_buf[digests_offset] and finishes with COMPARE_S_SIMD. */ __kernel void m17300_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, 
__global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + /* Copy the first eight 32-bit words of this work-item's entry in pws into pw_buf0 and pw_buf1. */ + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * digest + */ + /* Four digest words of the entry at digests_offset. search is not named again in this kernel; NOTE(review): presumably COMPARE_S_SIMD refers to it inside its expansion - confirm in inc_simd.cl. */ + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + /* Only w0 and w1 are passed on and read below; w2 and w3 are not used again in this kernel. NOTE(review): apply_rules_vect presumably writes the rule-modified candidate into w0 and w1 and returns its length - confirm against inc_rp_optimized.cl. */ + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + /* 25 state lanes a00..a44: the first four are built from w0 and w1, a32 has only its top bit set, all others start at zero. NOTE(review): a32 is lane 17 (5 * 3 + 2), so the set bit presumably closes a 144-byte rate block as in SHA3-224 padding - confirm. */ + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + 
u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + /* Rho_Pi (ad, r) is one link of the Rho Pi chain: lane ad receives the carried value t rotated left by r, and the previous content of ad becomes the new t (bc0 serves as scratch). */ + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + /* Full rounds 0 .. KECCAK_ROUNDS - 2 (Theta, Rho Pi, Chi, Iota); the last round is written out separately after the loop. */ for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & 
bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + /* Final round, reduced to what the comparison below reads: Theta leaves out a01, a11, a40, a42, a14 and a24, the Rho Pi chain stops after a02, Chi updates only a02 and a03, and there is no Iota step. */ + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + /* The four comparison words are the low half of a03, a constant 0, and the low and high halves of a02. */ + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17300_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const 
u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* Empty body: m17300_s08 performs no work. NOTE(review): presumably kept only so that this entry point exists for the host - confirm. */ +} + +__kernel void m17300_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* Empty body: m17300_s16 performs no work. NOTE(review): presumably kept only so that this entry point exists for the host - confirm. */ +} diff --git a/OpenCL/m17300_a1-optimized.cl b/OpenCL/m17300_a1-optimized.cl new file mode 100644 index 000000000..2f3d1b305 --- /dev/null +++ b/OpenCL/m17300_a1-optimized.cl @@ -0,0 +1,594 @@ +/** + * Author......: See docs/credits.txt + * 
License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +/* Table of 24 64-bit constants; the kernels below XOR entry [round] into a00 in their Iota step. */ __constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +/* Combinator kernel: each candidate is the word from pws OR-combined with a word read from combs_buf; the result is checked with COMPARE_M_SIMD. */ __kernel void m17300_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + 
if (gid >= gid_max) return; + /* Copy the first eight 32-bit words of this work-item's entry in pws into pw_buf0 and pw_buf1. */ + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + /* pw_len is computed here but not read again in this kernel. */ + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + /* One side is passed to switch_buffer_by_offset_le_VV with the other side's length: the right word with pw_l_len in BASE_LEFT mode, otherwise the left word with pw_r_len; the two sides are then ORed together below. NOTE(review): assumes the call moves the buffer contents up by that many bytes - confirm in inc_common.cl. */ + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + 
w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + /* 25 state lanes a00..a44: the first eight (a00..a04, a10..a12) are built from w0..w3, a32 has only its top bit set, all others start at zero. NOTE(review): no append_0x06 helper is called in this kernel, so the 0x06 byte presumably already arrives inside combs_buf - confirm on the host side. */ + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + /* Rho_Pi (ad, r) is one link of the Rho Pi chain: lane ad receives the carried value t rotated left by r, and the previous content of ad becomes the new t (bc0 serves as scratch). */ + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + /* Full rounds 0 .. KECCAK_ROUNDS - 2 (Theta, Rho Pi, Chi, Iota); the last round is written out separately after the loop. */ for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); 
+ Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + /* Final round, reduced to what the comparison below reads: Theta leaves out a01, a11, a40, a42, a14 and a24, the Rho Pi chain stops after a02, Chi updates only a02 and a03, and there is no Iota step. */ + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + 
Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + /* The four comparison words are the low half of a03, a constant 0, and the low and high halves of a02. */ + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + + +__kernel void m17300_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* Empty body: m17300_m08 performs no work. NOTE(review): presumably kept only so that this entry point exists for the host - confirm. */ +} + +__kernel void m17300_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, 
__global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17300_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 
search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | 
wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); 
+ Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 
^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17300_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17300_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, 
__global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m17300_a3-optimized.cl b/OpenCL/m17300_a3-optimized.cl new file mode 100644 index 000000000..d74ae7204 --- /dev/null +++ b/OpenCL/m17300_a3-optimized.cl @@ -0,0 +1,695 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +DECLSPEC void m17300m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; 
a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 
^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +DECLSPEC void m17300s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + 
digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + 
Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + 
Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17300_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300m 
(w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17300_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + 
+ const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17300_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; 
+ w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17300_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 
0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17300_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + 
w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17300_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; 
+ w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} diff --git a/OpenCL/m17400_a0-optimized.cl b/OpenCL/m17400_a0-optimized.cl new file mode 100644 index 000000000..6d70475d8 --- /dev/null +++ b/OpenCL/m17400_a0-optimized.cl @@ -0,0 +1,479 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp_optimized.h" +#include "inc_rp_optimized.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + 
+#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +__kernel void m17400_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 
(w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0x8000000000000000; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + 
a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17400_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t 
*combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17400_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) 
+{ +} + +__kernel void m17400_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len 
= apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0x8000000000000000; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = 
a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 
(a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17400_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17400_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 
bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m05000_a1-optimized.cl b/OpenCL/m17400_a1-optimized.cl similarity index 98% rename from OpenCL/m05000_a1-optimized.cl rename to OpenCL/m17400_a1-optimized.cl index 39c370c52..2cae24c29 100644 --- a/OpenCL/m05000_a1-optimized.cl +++ b/OpenCL/m17400_a1-optimized.cl @@ -28,7 +28,7 @@ __constant u64a keccakf_rndc[24] = #define KECCAK_ROUNDS 24 #endif -__kernel void m05000_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 
*bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * modifier @@ -297,15 +297,15 @@ __kernel void m05000_m04 (__global pw_t *pws, __global const kernel_rule_t *rule } -__kernel void m05000_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t 
*bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } -__kernel void m05000_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17400_m16 (__global 
pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } -__kernel void m05000_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, 
const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * modifier @@ -585,10 +585,10 @@ __kernel void m05000_s04 (__global pw_t *pws, __global const kernel_rule_t *rule } } -__kernel void m05000_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global 
u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } -__kernel void m05000_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void 
*esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17400_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } diff --git a/OpenCL/m17400_a3-optimized.cl b/OpenCL/m17400_a3-optimized.cl new file mode 100644 index 000000000..8d52362bc --- /dev/null +++ b/OpenCL/m17400_a3-optimized.cl @@ -0,0 +1,696 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 
0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +DECLSPEC void m17400m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = 
hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0x8000000000000000; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 
& bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +DECLSPEC void m17400s (u32 *w0, u32 *w1, u32 *w2, 
u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x 
a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0x8000000000000000; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & 
bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17400_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 
*bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17400_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17400_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, 
__global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17400_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, 
__global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17400_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global 
const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17400_s16 
(__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, 
d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} diff --git a/OpenCL/m17500_a0-optimized.cl b/OpenCL/m17500_a0-optimized.cl new file mode 100644 index 000000000..794b0f852 --- /dev/null +++ b/OpenCL/m17500_a0-optimized.cl @@ -0,0 +1,479 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp_optimized.h" +#include "inc_rp_optimized.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +__kernel void m17500_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ 
a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + 
u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17500_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 
loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17500_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17500_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; 
round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= 
keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17500_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 
*d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17500_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m17500_a1-optimized.cl b/OpenCL/m17500_a1-optimized.cl new file mode 100644 index 000000000..0fac055b6 --- /dev/null +++ b/OpenCL/m17500_a1-optimized.cl @@ -0,0 +1,594 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 
0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +__kernel void m17500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = 
pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x 
a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= 
~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + + 
+__kernel void m17500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, 
const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + 
const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], 
w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = 
a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t 
*combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) 
+{ +} diff --git a/OpenCL/m17500_a3-optimized.cl b/OpenCL/m17500_a3-optimized.cl new file mode 100644 index 000000000..3165e8ab5 --- /dev/null +++ b/OpenCL/m17500_a3-optimized.cl @@ -0,0 +1,695 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +DECLSPEC void m17500m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, 
const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 
15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 
45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +DECLSPEC void m17500s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < 
il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + 
Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + 
const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} +/* m17500_m04: kernel entry point that loads only pws[gid].i[0..3] into w0, zero-fills w1..w3, reads pws[gid].pw_len and delegates to m17500m. The combs_mode and gid_max arguments are not forwarded. */ +__kernel void m17500_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; /* work-items at or beyond gid_max exit before reading pws */ + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; /* w1..w3 are not loaded from pws in this variant; they are passed on as zeros */ + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown,
salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} +/* m17500_m08: kernel entry point that loads pws[gid].i[0..7] into w0 and w1, zero-fills w2 and w3, reads pws[gid].pw_len and delegates to m17500m. The combs_mode and gid_max arguments are not forwarded. */ +__kernel void m17500_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; /* work-items at or beyond gid_max exit before reading pws */ + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; /* w2 and w3 are not loaded from pws in this variant; they are passed on as zeros */ + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b,
bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} +/* m17500_m16: kernel entry point that loads pws[gid].i[0..13] into w0..w3 (the last two words of w3 stay zero), reads pws[gid].pw_len and delegates to m17500m. The combs_mode and gid_max arguments are not forwarded. */ +__kernel void m17500_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; /* work-items at or beyond gid_max exit before reading pws */ + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; /* only i[0..13] are read; w3[2] and w3[3] are forced to zero */ + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500m (w0, w1, w2, w3, pw_len, pws,
rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} +/* m17500_s04: kernel entry point that loads only pws[gid].i[0..3] into w0, zero-fills w1..w3, reads pws[gid].pw_len and delegates to m17500s. The combs_mode and gid_max arguments are not forwarded. */ +__kernel void m17500_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; /* work-items at or beyond gid_max exit before reading pws */ + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; /* w1..w3 are not loaded from pws in this variant; they are passed on as zeros */ + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500s (w0, w1,
w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} +/* m17500_s08: kernel entry point that loads pws[gid].i[0..7] into w0 and w1, zero-fills w2 and w3, reads pws[gid].pw_len and delegates to m17500s. The combs_mode and gid_max arguments are not forwarded. */ +__kernel void m17500_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; /* work-items at or beyond gid_max exit before reading pws */ + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; /* w2 and w3 are not loaded from pws in this variant; they are passed on as zeros */ + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const
u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} +/* m17500_s16: kernel entry point that loads pws[gid].i[0..13] into w0..w3 (the last two words of w3 stay zero), reads pws[gid].pw_len and delegates to m17500s. The combs_mode and gid_max arguments are not forwarded. */ +__kernel void m17500_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; /* work-items at or beyond gid_max exit before reading pws */ + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2]
= pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} diff --git a/OpenCL/m17600_a0-optimized.cl b/OpenCL/m17600_a0-optimized.cl new file mode 100644 index 000000000..6b7c8dc34 --- /dev/null +++ b/OpenCL/m17600_a0-optimized.cl @@ -0,0 +1,479 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp_optimized.h" +#include "inc_rp_optimized.cl" +#include "inc_simd.cl" +/* Round constants: the kernels in this file XOR keccakf_rndc[round] into a00 in the Iota step of each looped round. */ +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; +/* Round count, overridable by defining KECCAK_ROUNDS earlier. The round loops run KECCAK_ROUNDS - 1 iterations; the last round is written out separately after each loop. */ +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif +/* m17600_m04: loads pws[gid].i[0..7] into pw_buf0/pw_buf1, then for each il_pos (stepping by VECT_SIZE) applies rules with apply_rules_vect, appends 0x06 at out_len via append_0x06_2x4_VV, runs the Keccak rounds on a state seeded from w0/w1 with a13 = 0x8000000000000000, and checks r0..r3 with COMPARE_M_SIMD. */ +__kernel void m17600_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void
*hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 
= 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + 
+ bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + /* Last round, written out after the loop in reduced form: Theta skips a40, a01, a11, a42, a14 and a24, the Rho Pi chain stops at a02, Chi is evaluated for a02 and a03 only, and there is no Iota step. */ + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + /* Reduced Chi: each Rho_Pi step stores the rotated previous target, so a02, a03 and a04 now hold rotations of the pre-chain a22, a33 and a44. Only a02 and a03 are finished, since r0..r3 are taken from them. */ + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17600_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32
*bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* m17600_m08 has an empty body: the entry point exists but performs no work. NOTE(review): presumably only the _m04 variant is used for this rule-based kernel file - confirm against the host code. */ +} +/* m17600_m16: entry point with the same parameter list as m17600_m08 and an empty body. */ +__kernel void m17600_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* intentionally left without statements in this patch */ +} +/* m17600_s04: loads pws[gid].i[0..7] into pw_buf0/pw_buf1, builds search[4] from digests_buf[digests_offset].digest_buf[DGST_R0..DGST_R3], then for each il_pos (stepping by VECT_SIZE) applies rules with apply_rules_vect, appends 0x06 via append_0x06_2x4_VV, runs the Keccak rounds and finishes with COMPARE_S_SIMD on r0..r3. */ +__kernel void m17600_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const
u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x06_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], 
w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 
& bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + /* Last round, written out after the loop in reduced form: Theta skips a40, a01, a11, a42, a14 and a24, the Rho Pi chain stops at a02, Chi is evaluated for a02 and a03 only, and there is no Iota step. */ + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + /* Reduced Chi: each Rho_Pi step stores the rotated previous target, so a02, a03 and a04 now hold rotations of the pre-chain a22, a33 and a44. Only a02 and a03 are finished, since r0..r3 are taken from them. */ + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17600_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void
*tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* m17600_s08 has an empty body: the entry point exists but performs no work. NOTE(review): presumably only the _s04 variant is used for this rule-based kernel file - confirm against the host code. */ +} +/* m17600_s16: entry point with the same parameter list as m17600_s08 and an empty body. */ +__kernel void m17600_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* intentionally left without statements in this patch */ +} diff --git a/OpenCL/m17600_a1-optimized.cl
b/OpenCL/m17600_a1-optimized.cl new file mode 100644 index 000000000..395c37b7f --- /dev/null +++ b/OpenCL/m17600_a1-optimized.cl @@ -0,0 +1,594 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +__kernel void m17600_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, 
const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + 
w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + 
Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + 
Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + + +__kernel void m17600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17600_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17600_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + 
pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; 
+ w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 
36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 
14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17600_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global 
plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m17600_a3-optimized.cl b/OpenCL/m17600_a3-optimized.cl new file mode 100644 index 000000000..866f13ee2 --- /dev/null +++ b/OpenCL/m17600_a3-optimized.cl @@ -0,0 +1,696 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +DECLSPEC void m17600m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 
*bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ 
a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; 
+ + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +DECLSPEC void m17600s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, 
const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; 
a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 
(bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17600_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; 
/**
 * m17600_m08: per-work-item entry point.
 *
 * Copies the first eight 32-bit words of pws[gid].i into the 16-word message
 * buffer (split into four 4-word arrays), leaves the other eight words zero
 * and forwards the buffer, the candidate length and the kernel arguments to
 * m17600m ().
 *
 * combs_mode is accepted but not referenced; gid_max only bounds the
 * work-item index.
 */
__kernel void m17600_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  // Work-items at or beyond gid_max have no candidate to process.
  if (gid < gid_max)
  {
    const u32 pw_len = pws[gid].pw_len;

    // Words 0..7 come from the candidate, words 8..15 stay zero.
    u32 blk0[4] = { pws[gid].i[ 0], pws[gid].i[ 1], pws[gid].i[ 2], pws[gid].i[ 3] };
    u32 blk1[4] = { pws[gid].i[ 4], pws[gid].i[ 5], pws[gid].i[ 6], pws[gid].i[ 7] };
    u32 blk2[4] = { 0, 0, 0, 0 };
    u32 blk3[4] = { 0, 0, 0, 0 };

    m17600m (blk0, blk1, blk2, blk3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset);
  }
}
/**
 * m17600_s04: per-work-item entry point.
 *
 * Copies the first four 32-bit words of pws[gid].i into the 16-word message
 * buffer (split into four 4-word arrays), leaves the other twelve words zero
 * and forwards the buffer, the candidate length and the kernel arguments to
 * m17600s ().
 *
 * combs_mode is accepted but not referenced; gid_max only bounds the
 * work-item index.
 */
__kernel void m17600_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  // Work-items at or beyond gid_max have no candidate to process.
  if (gid < gid_max)
  {
    const u32 pw_len = pws[gid].pw_len;

    // Words 0..3 come from the candidate, words 4..15 stay zero.
    u32 blk0[4] = { pws[gid].i[ 0], pws[gid].i[ 1], pws[gid].i[ 2], pws[gid].i[ 3] };
    u32 blk1[4] = { 0, 0, 0, 0 };
    u32 blk2[4] = { 0, 0, 0, 0 };
    u32 blk3[4] = { 0, 0, 0, 0 };

    m17600s (blk0, blk1, blk2, blk3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset);
  }
}
/**
 * m17600_s16: per-work-item entry point.
 *
 * Copies words 0..13 of pws[gid].i into the 16-word message buffer (split
 * into four 4-word arrays), leaves the last two words zero and forwards the
 * buffer, the candidate length and the kernel arguments to m17600s ().
 *
 * combs_mode is accepted but not referenced; gid_max only bounds the
 * work-item index.
 */
__kernel void m17600_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  const u64 gid = get_global_id (0);

  // Work-items at or beyond gid_max have no candidate to process.
  if (gid < gid_max)
  {
    const u32 pw_len = pws[gid].pw_len;

    // Words 0..13 come from the candidate, words 14 and 15 stay zero.
    u32 blk0[4] = { pws[gid].i[ 0], pws[gid].i[ 1], pws[gid].i[ 2], pws[gid].i[ 3] };
    u32 blk1[4] = { pws[gid].i[ 4], pws[gid].i[ 5], pws[gid].i[ 6], pws[gid].i[ 7] };
    u32 blk2[4] = { pws[gid].i[ 8], pws[gid].i[ 9], pws[gid].i[10], pws[gid].i[11] };
    u32 blk3[4] = { pws[gid].i[12], pws[gid].i[13], 0, 0 };

    m17600s (blk0, blk1, blk2, blk3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset);
  }
}
/**
 * m17700_m04 (rule-based variant): for every rule slot il_pos (stepping by
 * VECT_SIZE) the base candidate words pws[gid].i[0..7] are handed to
 * apply_rules_vect (), the result is loaded into a 25-lane Keccak state,
 * KECCAK_ROUNDS rounds are evaluated (the last one only partially) and four
 * 32-bit words are passed to COMPARE_M_SIMD ().
 *
 * lid, w2 and w3 are not referenced by the visible code; they are kept because
 * the COMPARE_*_SIMD macros are defined elsewhere and pick up names from this
 * scope. NOTE(review): confirm whether they are still needed.
 */
__kernel void m17700_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);

  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  u32 pw_buf0[4];
  u32 pw_buf1[4];

  pw_buf0[0] = pws[gid].i[0];
  pw_buf0[1] = pws[gid].i[1];
  pw_buf0[2] = pws[gid].i[2];
  pw_buf0[3] = pws[gid].i[3];
  pw_buf1[0] = pws[gid].i[4];
  pw_buf1[1] = pws[gid].i[5];
  pw_buf1[2] = pws[gid].i[6];
  pw_buf1[3] = pws[gid].i[7];

  const u32 pw_len = pws[gid].pw_len;

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    u32x w0[4] = { 0 };
    u32x w1[4] = { 0 };
    u32x w2[4] = { 0 };
    u32x w3[4] = { 0 };

    const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);

    // NOTE(review): presumably writes a 0x01 byte at byte offset out_len - confirm
    append_0x01_2x4_VV (w0, w1, out_len);

    /**
     * Keccak
     */

    // Lanes a00..a03 take the eight candidate words (low word first); every
    // other lane starts at zero except a32, whose top bit is set.
    // NOTE(review): the a32 bit is presumably the closing padding bit for this
    // mode's block size - confirm.

    u64x a00 = hl32_to_64 (w0[1], w0[0]);
    u64x a01 = hl32_to_64 (w0[3], w0[2]);
    u64x a02 = hl32_to_64 (w1[1], w1[0]);
    u64x a03 = hl32_to_64 (w1[3], w1[2]);
    u64x a04 = 0;
    u64x a10 = 0;
    u64x a11 = 0;
    u64x a12 = 0;
    u64x a13 = 0;
    u64x a14 = 0;
    u64x a20 = 0;
    u64x a21 = 0;
    u64x a22 = 0;
    u64x a23 = 0;
    u64x a24 = 0;
    u64x a30 = 0;
    u64x a31 = 0;
    u64x a32 = 0x8000000000000000;
    u64x a33 = 0;
    u64x a34 = 0;
    u64x a40 = 0;
    u64x a41 = 0;
    u64x a42 = 0;
    u64x a43 = 0;
    u64x a44 = 0;

    // One Rho/Pi step: lane `ad` receives the carried value t rotated left by
    // r, and the previous content of `ad` becomes the new carried value.
    // Relies on bc0 and t being declared at the point of use. The definition
    // ends without a line continuation and without a semicolon; the caller's
    // semicolon terminates the last statement.

    #define Rho_Pi(ad,r)     \
      bc0 = ad;              \
      ad = rotl64 (t, r);    \
      t = bc0

    // All rounds but the last are evaluated in full.

    #ifdef _unroll
    #pragma unroll
    #endif
    for (int round = 0; round < KECCAK_ROUNDS - 1; round++)
    {
      // Theta

      u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40;
      u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41;
      u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42;
      u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43;
      u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44;

      u64x t;

      t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t;
      t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t;
      t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t;
      t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t;
      t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t;

      // Rho Pi

      t = a01;

      Rho_Pi (a20,  1);
      Rho_Pi (a12,  3);
      Rho_Pi (a21,  6);
      Rho_Pi (a32, 10);
      Rho_Pi (a33, 15);
      Rho_Pi (a03, 21);
      Rho_Pi (a10, 28);
      Rho_Pi (a31, 36);
      Rho_Pi (a13, 45);
      Rho_Pi (a41, 55);
      Rho_Pi (a44,  2);
      Rho_Pi (a04, 14);
      Rho_Pi (a30, 27);
      Rho_Pi (a43, 41);
      Rho_Pi (a34, 56);
      Rho_Pi (a23,  8);
      Rho_Pi (a22, 25);
      Rho_Pi (a02, 43);
      Rho_Pi (a40, 62);
      Rho_Pi (a24, 18);
      Rho_Pi (a42, 39);
      Rho_Pi (a14, 61);
      Rho_Pi (a11, 20);
      Rho_Pi (a01, 44);

      // Chi

      bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04;
      a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1;

      bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14;
      a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1;

      bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24;
      a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1;

      bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34;
      a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1;

      bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44;
      a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1;

      // Iota

      a00 ^= keccakf_rndc[round];
    }

    // Last round, evaluated only as far as a02 and a03 require: six Theta lane
    // updates (a40, a01, a11, a42, a14, a24), the last six Rho/Pi steps, all
    // Chi terms other than a02/a03 and the Iota step are left out.

    // Theta

    u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40;
    u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41;
    u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42;
    u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43;
    u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44;

    u64x t;

    t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t;
    t = bc0 ^ rotl64 (bc2, 1);                     a21 ^= t; a31 ^= t; a41 ^= t;
    t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t;
    t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t;
    t = bc3 ^ rotl64 (bc0, 1); a04 ^= t;                     a34 ^= t; a44 ^= t;

    // Rho Pi

    t = a01;

    Rho_Pi (a20,  1);
    Rho_Pi (a12,  3);
    Rho_Pi (a21,  6);
    Rho_Pi (a32, 10);
    Rho_Pi (a33, 15);
    Rho_Pi (a03, 21);
    Rho_Pi (a10, 28);
    Rho_Pi (a31, 36);
    Rho_Pi (a13, 45);
    Rho_Pi (a41, 55);
    Rho_Pi (a44,  2);
    Rho_Pi (a04, 14);
    Rho_Pi (a30, 27);
    Rho_Pi (a43, 41);
    Rho_Pi (a34, 56);
    Rho_Pi (a23,  8);
    Rho_Pi (a22, 25);
    Rho_Pi (a02, 43);

    #undef Rho_Pi

    // Chi, restricted to the two lanes that are compared

    bc0 = a00;
    bc2 = a02;
    bc3 = a03;
    bc4 = a04;

    a02 ^= ~bc3 & bc4;
    a03 ^= ~bc4 & bc0;

    // r1 is fixed to zero; the high half of a03 is not compared.
    // NOTE(review): presumably because it lies outside the digest - confirm.

    const u32x r0 = l32_from_64 (a03);
    const u32x r1 = 0;
    const u32x r2 = l32_from_64 (a02);
    const u32x r3 = h32_from_64 (a02);

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}
/**
 * m17700_s04 (rule-based variant): for every rule slot il_pos (stepping by
 * VECT_SIZE) the base candidate words pws[gid].i[0..7] are handed to
 * apply_rules_vect (), the result is loaded into a 25-lane Keccak state,
 * KECCAK_ROUNDS rounds are evaluated (the last one only partially) and four
 * 32-bit words are passed to COMPARE_S_SIMD ().
 *
 * `search` holds the four digest words read from digests_buf[digests_offset];
 * it is not referenced by name in the visible code. NOTE(review): presumably
 * consumed by COMPARE_S_SIMD - confirm. lid, w2 and w3 are likewise kept
 * because the comparison macro is defined elsewhere and picks up names from
 * this scope.
 */
__kernel void m17700_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);

  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  u32 pw_buf0[4];
  u32 pw_buf1[4];

  pw_buf0[0] = pws[gid].i[0];
  pw_buf0[1] = pws[gid].i[1];
  pw_buf0[2] = pws[gid].i[2];
  pw_buf0[3] = pws[gid].i[3];
  pw_buf1[0] = pws[gid].i[4];
  pw_buf1[1] = pws[gid].i[5];
  pw_buf1[2] = pws[gid].i[6];
  pw_buf1[3] = pws[gid].i[7];

  const u32 pw_len = pws[gid].pw_len;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[digests_offset].digest_buf[DGST_R0],
    digests_buf[digests_offset].digest_buf[DGST_R1],
    digests_buf[digests_offset].digest_buf[DGST_R2],
    digests_buf[digests_offset].digest_buf[DGST_R3]
  };

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    u32x w0[4] = { 0 };
    u32x w1[4] = { 0 };
    u32x w2[4] = { 0 };
    u32x w3[4] = { 0 };

    const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);

    // NOTE(review): presumably writes a 0x01 byte at byte offset out_len - confirm
    append_0x01_2x4_VV (w0, w1, out_len);

    /**
     * Keccak
     */

    // Lanes a00..a03 take the eight candidate words (low word first); every
    // other lane starts at zero except a32, whose top bit is set.
    // NOTE(review): the a32 bit is presumably the closing padding bit for this
    // mode's block size - confirm.

    u64x a00 = hl32_to_64 (w0[1], w0[0]);
    u64x a01 = hl32_to_64 (w0[3], w0[2]);
    u64x a02 = hl32_to_64 (w1[1], w1[0]);
    u64x a03 = hl32_to_64 (w1[3], w1[2]);
    u64x a04 = 0;
    u64x a10 = 0;
    u64x a11 = 0;
    u64x a12 = 0;
    u64x a13 = 0;
    u64x a14 = 0;
    u64x a20 = 0;
    u64x a21 = 0;
    u64x a22 = 0;
    u64x a23 = 0;
    u64x a24 = 0;
    u64x a30 = 0;
    u64x a31 = 0;
    u64x a32 = 0x8000000000000000;
    u64x a33 = 0;
    u64x a34 = 0;
    u64x a40 = 0;
    u64x a41 = 0;
    u64x a42 = 0;
    u64x a43 = 0;
    u64x a44 = 0;

    // One Rho/Pi step: lane `ad` receives the carried value t rotated left by
    // r, and the previous content of `ad` becomes the new carried value.
    // Relies on bc0 and t being declared at the point of use. The definition
    // ends without a line continuation and without a semicolon; the caller's
    // semicolon terminates the last statement.

    #define Rho_Pi(ad,r)     \
      bc0 = ad;              \
      ad = rotl64 (t, r);    \
      t = bc0

    // All rounds but the last are evaluated in full.

    #ifdef _unroll
    #pragma unroll
    #endif
    for (int round = 0; round < KECCAK_ROUNDS - 1; round++)
    {
      // Theta

      u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40;
      u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41;
      u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42;
      u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43;
      u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44;

      u64x t;

      t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t;
      t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t;
      t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t;
      t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t;
      t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t;

      // Rho Pi

      t = a01;

      Rho_Pi (a20,  1);
      Rho_Pi (a12,  3);
      Rho_Pi (a21,  6);
      Rho_Pi (a32, 10);
      Rho_Pi (a33, 15);
      Rho_Pi (a03, 21);
      Rho_Pi (a10, 28);
      Rho_Pi (a31, 36);
      Rho_Pi (a13, 45);
      Rho_Pi (a41, 55);
      Rho_Pi (a44,  2);
      Rho_Pi (a04, 14);
      Rho_Pi (a30, 27);
      Rho_Pi (a43, 41);
      Rho_Pi (a34, 56);
      Rho_Pi (a23,  8);
      Rho_Pi (a22, 25);
      Rho_Pi (a02, 43);
      Rho_Pi (a40, 62);
      Rho_Pi (a24, 18);
      Rho_Pi (a42, 39);
      Rho_Pi (a14, 61);
      Rho_Pi (a11, 20);
      Rho_Pi (a01, 44);

      // Chi

      bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04;
      a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1;

      bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14;
      a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1;

      bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24;
      a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1;

      bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34;
      a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1;

      bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44;
      a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1;

      // Iota

      a00 ^= keccakf_rndc[round];
    }

    // Last round, evaluated only as far as a02 and a03 require: six Theta lane
    // updates (a40, a01, a11, a42, a14, a24), the last six Rho/Pi steps, all
    // Chi terms other than a02/a03 and the Iota step are left out.

    // Theta

    u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40;
    u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41;
    u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42;
    u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43;
    u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44;

    u64x t;

    t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t;
    t = bc0 ^ rotl64 (bc2, 1);                     a21 ^= t; a31 ^= t; a41 ^= t;
    t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t;
    t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t;
    t = bc3 ^ rotl64 (bc0, 1); a04 ^= t;                     a34 ^= t; a44 ^= t;

    // Rho Pi

    t = a01;

    Rho_Pi (a20,  1);
    Rho_Pi (a12,  3);
    Rho_Pi (a21,  6);
    Rho_Pi (a32, 10);
    Rho_Pi (a33, 15);
    Rho_Pi (a03, 21);
    Rho_Pi (a10, 28);
    Rho_Pi (a31, 36);
    Rho_Pi (a13, 45);
    Rho_Pi (a41, 55);
    Rho_Pi (a44,  2);
    Rho_Pi (a04, 14);
    Rho_Pi (a30, 27);
    Rho_Pi (a43, 41);
    Rho_Pi (a34, 56);
    Rho_Pi (a23,  8);
    Rho_Pi (a22, 25);
    Rho_Pi (a02, 43);

    #undef Rho_Pi

    // Chi, restricted to the two lanes that are compared

    bc0 = a00;
    bc2 = a02;
    bc3 = a03;
    bc4 = a04;

    a02 ^= ~bc3 & bc4;
    a03 ^= ~bc4 & bc0;

    // r1 is fixed to zero; the high half of a03 is not compared.
    // NOTE(review): presumably because it lies outside the digest - confirm.

    const u32x r0 = l32_from_64 (a03);
    const u32x r1 = 0;
    const u32x r2 = l32_from_64 (a02);
    const u32x r3 = h32_from_64 (a02);

    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}
/**
 * m17700_m04 (combinator variant): for every slot il_pos of combs_buf
 * (stepping by VECT_SIZE) the left words pws[gid].i[0..7] and the right words
 * read through ix_create_combt () are shifted and OR-ed into one 16-word
 * buffer, loaded into a 25-lane Keccak state, run through KECCAK_ROUNDS rounds
 * (the last one only partially) and four 32-bit words are passed to
 * COMPARE_M_SIMD ().
 *
 * No padding byte is appended in this kernel.
 * NOTE(review): presumably the combinator input already carries it - confirm.
 *
 * lid and pw_len are not referenced by the visible code; they are kept because
 * COMPARE_M_SIMD is defined elsewhere and picks up names from this scope.
 * NOTE(review): confirm whether they are still needed.
 */
__kernel void m17700_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);

  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  u32 pw_buf0[4];
  u32 pw_buf1[4];

  pw_buf0[0] = pws[gid].i[0];
  pw_buf0[1] = pws[gid].i[1];
  pw_buf0[2] = pws[gid].i[2];
  pw_buf0[3] = pws[gid].i[3];
  pw_buf1[0] = pws[gid].i[4];
  pw_buf1[1] = pws[gid].i[5];
  pw_buf1[2] = pws[gid].i[6];
  pw_buf1[3] = pws[gid].i[7];

  const u32 pw_l_len = pws[gid].pw_len;

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
  {
    const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);

    const u32x pw_len = pw_l_len + pw_r_len;

    /**
     * concat password candidate
     */

    u32x wordl0[4] = { 0 };
    u32x wordl1[4] = { 0 };
    u32x wordl2[4] = { 0 };
    u32x wordl3[4] = { 0 };

    wordl0[0] = pw_buf0[0];
    wordl0[1] = pw_buf0[1];
    wordl0[2] = pw_buf0[2];
    wordl0[3] = pw_buf0[3];
    wordl1[0] = pw_buf1[0];
    wordl1[1] = pw_buf1[1];
    wordl1[2] = pw_buf1[2];
    wordl1[3] = pw_buf1[3];

    u32x wordr0[4] = { 0 };
    u32x wordr1[4] = { 0 };
    u32x wordr2[4] = { 0 };
    u32x wordr3[4] = { 0 };

    wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
    wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
    wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
    wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
    wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
    wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
    wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
    wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);

    // Exactly one side is shifted, by the length of the other side, so that
    // the OR below yields the concatenation selected by combs_mode.

    if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
    {
      switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
    }
    else
    {
      switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
    }

    u32x w0[4];
    u32x w1[4];
    u32x w2[4];
    u32x w3[4];

    w0[0] = wordl0[0] | wordr0[0];
    w0[1] = wordl0[1] | wordr0[1];
    w0[2] = wordl0[2] | wordr0[2];
    w0[3] = wordl0[3] | wordr0[3];
    w1[0] = wordl1[0] | wordr1[0];
    w1[1] = wordl1[1] | wordr1[1];
    w1[2] = wordl1[2] | wordr1[2];
    w1[3] = wordl1[3] | wordr1[3];
    w2[0] = wordl2[0] | wordr2[0];
    w2[1] = wordl2[1] | wordr2[1];
    w2[2] = wordl2[2] | wordr2[2];
    w2[3] = wordl2[3] | wordr2[3];
    w3[0] = wordl3[0] | wordr3[0];
    w3[1] = wordl3[1] | wordr3[1];
    w3[2] = wordl3[2] | wordr3[2];
    w3[3] = wordl3[3] | wordr3[3];

    /**
     * Keccak
     */

    // Lanes a00..a12 take the sixteen candidate words (low word first); every
    // other lane starts at zero except a32, whose top bit is set.
    // NOTE(review): the a32 bit is presumably the closing padding bit for this
    // mode's block size - confirm.

    u64x a00 = hl32_to_64 (w0[1], w0[0]);
    u64x a01 = hl32_to_64 (w0[3], w0[2]);
    u64x a02 = hl32_to_64 (w1[1], w1[0]);
    u64x a03 = hl32_to_64 (w1[3], w1[2]);
    u64x a04 = hl32_to_64 (w2[1], w2[0]);
    u64x a10 = hl32_to_64 (w2[3], w2[2]);
    u64x a11 = hl32_to_64 (w3[1], w3[0]);
    u64x a12 = hl32_to_64 (w3[3], w3[2]);
    u64x a13 = 0;
    u64x a14 = 0;
    u64x a20 = 0;
    u64x a21 = 0;
    u64x a22 = 0;
    u64x a23 = 0;
    u64x a24 = 0;
    u64x a30 = 0;
    u64x a31 = 0;
    u64x a32 = 0x8000000000000000;
    u64x a33 = 0;
    u64x a34 = 0;
    u64x a40 = 0;
    u64x a41 = 0;
    u64x a42 = 0;
    u64x a43 = 0;
    u64x a44 = 0;

    // One Rho/Pi step: lane `ad` receives the carried value t rotated left by
    // r, and the previous content of `ad` becomes the new carried value.
    // Relies on bc0 and t being declared at the point of use. The definition
    // ends without a line continuation and without a semicolon; the caller's
    // semicolon terminates the last statement.

    #define Rho_Pi(ad,r)     \
      bc0 = ad;              \
      ad = rotl64 (t, r);    \
      t = bc0

    // All rounds but the last are evaluated in full.

    #ifdef _unroll
    #pragma unroll
    #endif
    for (int round = 0; round < KECCAK_ROUNDS - 1; round++)
    {
      // Theta

      u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40;
      u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41;
      u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42;
      u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43;
      u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44;

      u64x t;

      t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t;
      t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t;
      t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t;
      t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t;
      t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t;

      // Rho Pi

      t = a01;

      Rho_Pi (a20,  1);
      Rho_Pi (a12,  3);
      Rho_Pi (a21,  6);
      Rho_Pi (a32, 10);
      Rho_Pi (a33, 15);
      Rho_Pi (a03, 21);
      Rho_Pi (a10, 28);
      Rho_Pi (a31, 36);
      Rho_Pi (a13, 45);
      Rho_Pi (a41, 55);
      Rho_Pi (a44,  2);
      Rho_Pi (a04, 14);
      Rho_Pi (a30, 27);
      Rho_Pi (a43, 41);
      Rho_Pi (a34, 56);
      Rho_Pi (a23,  8);
      Rho_Pi (a22, 25);
      Rho_Pi (a02, 43);
      Rho_Pi (a40, 62);
      Rho_Pi (a24, 18);
      Rho_Pi (a42, 39);
      Rho_Pi (a14, 61);
      Rho_Pi (a11, 20);
      Rho_Pi (a01, 44);

      // Chi

      bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04;
      a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1;

      bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14;
      a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1;

      bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24;
      a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1;

      bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34;
      a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1;

      bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44;
      a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1;

      // Iota

      a00 ^= keccakf_rndc[round];
    }

    // Last round, evaluated only as far as a02 and a03 require: six Theta lane
    // updates (a40, a01, a11, a42, a14, a24), the last six Rho/Pi steps, all
    // Chi terms other than a02/a03 and the Iota step are left out.

    // Theta

    u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40;
    u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41;
    u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42;
    u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43;
    u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44;

    u64x t;

    t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t;
    t = bc0 ^ rotl64 (bc2, 1);                     a21 ^= t; a31 ^= t; a41 ^= t;
    t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t;
    t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t;
    t = bc3 ^ rotl64 (bc0, 1); a04 ^= t;                     a34 ^= t; a44 ^= t;

    // Rho Pi

    t = a01;

    Rho_Pi (a20,  1);
    Rho_Pi (a12,  3);
    Rho_Pi (a21,  6);
    Rho_Pi (a32, 10);
    Rho_Pi (a33, 15);
    Rho_Pi (a03, 21);
    Rho_Pi (a10, 28);
    Rho_Pi (a31, 36);
    Rho_Pi (a13, 45);
    Rho_Pi (a41, 55);
    Rho_Pi (a44,  2);
    Rho_Pi (a04, 14);
    Rho_Pi (a30, 27);
    Rho_Pi (a43, 41);
    Rho_Pi (a34, 56);
    Rho_Pi (a23,  8);
    Rho_Pi (a22, 25);
    Rho_Pi (a02, 43);

    #undef Rho_Pi

    // Chi, restricted to the two lanes that are compared

    bc0 = a00;
    bc2 = a02;
    bc3 = a03;
    bc4 = a04;

    a02 ^= ~bc3 & bc4;
    a03 ^= ~bc4 & bc0;

    // r1 is fixed to zero; the high half of a03 is not compared.
    // NOTE(review): presumably because it lies outside the digest - confirm.

    const u32x r0 = l32_from_64 (a03);
    const u32x r1 = 0;
    const u32x r2 = l32_from_64 (a02);
    const u32x r3 = h32_from_64 (a02);

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}
u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17700_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17700_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, 
const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = 
ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= 
t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; 
a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17700_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 
gid_max) +{ +} + +__kernel void m17700_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m17700_a3-optimized.cl b/OpenCL/m17700_a3-optimized.cl new file mode 100644 index 000000000..8f861eef6 --- /dev/null +++ b/OpenCL/m17700_a3-optimized.cl @@ -0,0 +1,695 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 
0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +DECLSPEC void m17300m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x 
a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + 
a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +DECLSPEC void m17300s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global 
const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0x8000000000000000; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // 
Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = 
a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = 0; + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17700_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 
*d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17700_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 
*d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17700_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, 
__global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17700_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global 
plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17700_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 
*bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17700_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 
*bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17300s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} diff --git a/OpenCL/m05000_a0-optimized.cl b/OpenCL/m17800_a0-optimized.cl similarity index 97% rename from OpenCL/m05000_a0-optimized.cl rename to OpenCL/m17800_a0-optimized.cl index 260e40324..5e6c1041f 100644 --- a/OpenCL/m05000_a0-optimized.cl +++ 
b/OpenCL/m17800_a0-optimized.cl @@ -30,7 +30,7 @@ __constant u64a keccakf_rndc[24] = #define KECCAK_ROUNDS 24 #endif -__kernel void m05000_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 
bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * modifier @@ -240,15 +240,15 @@ __kernel void m05000_m04 (__global pw_t *pws, __constant const kernel_rule_t *ru } } -__kernel void m05000_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 
*hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } -__kernel void m05000_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global 
const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } -__kernel void m05000_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const 
u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * modifier @@ -470,10 +470,10 @@ __kernel void m05000_s04 (__global pw_t *pws, __constant const kernel_rule_t *ru } } -__kernel void m05000_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global 
const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } -__kernel void m05000_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void 
m17800_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { } diff --git a/OpenCL/m17800_a1-optimized.cl b/OpenCL/m17800_a1-optimized.cl new file mode 100644 index 000000000..d4a24e427 --- /dev/null +++ b/OpenCL/m17800_a1-optimized.cl @@ -0,0 +1,594 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 
0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +__kernel void m17800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = 
pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 
= 0; + u64x a30 = 0; + u64x a31 = 0x8000000000000000; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; 
bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + + +__kernel void m17800_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, 
__global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17800_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17800_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, 
__global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = 
pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0x8000000000000000; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + 
u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 
^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17800_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 
*bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17800_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m05000_a3-optimized.cl b/OpenCL/m17800_a3-optimized.cl similarity index 96% rename from OpenCL/m05000_a3-optimized.cl rename to OpenCL/m17800_a3-optimized.cl index e0c885838..92ead0259 100644 --- a/OpenCL/m05000_a3-optimized.cl +++ b/OpenCL/m17800_a3-optimized.cl @@ -28,7 +28,7 @@ __constant u64a keccakf_rndc[24] = #define KECCAK_ROUNDS 24 
#endif -DECLSPEC void m05000m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +DECLSPEC void m17400m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, 
const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -214,7 +214,7 @@ DECLSPEC void m05000m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __g } } -DECLSPEC void m05000s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +DECLSPEC void m17400s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global 
const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -412,7 +412,7 @@ DECLSPEC void m05000s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __g } } -__kernel void m05000_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -456,10 +456,10 @@ __kernel void m05000_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05000m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + m17400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m05000_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -503,10 +503,10 @@ __kernel void m05000_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05000m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, 
bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + m17400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m05000_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_m16 (__global pw_t *pws, 
__global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -550,10 +550,10 @@ __kernel void m05000_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05000m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + m17400m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, 
bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m05000_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 
bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -597,10 +597,10 @@ __kernel void m05000_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05000s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + m17400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m05000_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 
*d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -644,10 +644,10 @@ __kernel void m05000_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05000s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, 
digests_offset); + m17400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m05000_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +__kernel void m17800_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 
*bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -691,5 +691,5 @@ __kernel void m05000_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * main */ - m05000s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); + m17400s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } diff --git a/OpenCL/m17900_a0-optimized.cl b/OpenCL/m17900_a0-optimized.cl new file mode 100644 index 000000000..35e8acd12 --- /dev/null +++ b/OpenCL/m17900_a0-optimized.cl @@ -0,0 +1,479 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include 
"inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp_optimized.h" +#include "inc_rp_optimized.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +__kernel void m17900_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); 
+ + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x01_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 
^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi 
(a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17900_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17900_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 
*bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17900_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + 
u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x01_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t 
= bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= 
t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17900_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17900_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, 
__global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m17900_a1-optimized.cl b/OpenCL/m17900_a1-optimized.cl new file mode 100644 index 000000000..de0748e34 --- /dev/null +++ b/OpenCL/m17900_a1-optimized.cl @@ -0,0 +1,594 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define 
KECCAK_ROUNDS 24 +#endif + +__kernel void m17900_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = 
pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x 
a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & 
bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + + +__kernel void m17900_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, 
__global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17900_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17900_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, 
__global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = 
pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; 
+ + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = 
a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17900_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, 
__global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m17900_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} diff --git a/OpenCL/m17900_a3-optimized.cl b/OpenCL/m17900_a3-optimized.cl new file mode 100644 index 000000000..4974e7efc --- /dev/null +++ b/OpenCL/m17900_a3-optimized.cl @@ -0,0 +1,695 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include 
"inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +DECLSPEC void m17500m (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + 
u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + 
a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + 
COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +DECLSPEC void m17500s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], 
w3[2]); + u64x a13 = 0; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0x8000000000000000; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; 
+ a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m17900_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, 
__global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17900_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 
*bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17900_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, 
__global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void 
m17900_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); 
+} + +__kernel void m17900_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, 
bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m17900_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17500s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, 
digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} diff --git a/OpenCL/m18000_a0-optimized.cl b/OpenCL/m18000_a0-optimized.cl new file mode 100644 index 000000000..3ac3ac67b --- /dev/null +++ b/OpenCL/m18000_a0-optimized.cl @@ -0,0 +1,479 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_rp_optimized.h" +#include "inc_rp_optimized.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +__kernel void m18000_m04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, 
__global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x01_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + 
u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ 
a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m18000_m08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, 
const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* m18000_m08: empty kernel body, this entry point performs no work */ +} + +__kernel void m18000_m16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* m18000_m16: empty kernel body, this entry point performs no work */ +} + +__kernel void m18000_s04 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, 
__global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; + + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); + + append_0x01_2x4_VV (w0, w1, out_len); + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = 0; + u64x a10 = 0; + u64x a11 = 0; + u64x a12 = 0; + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + 
ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= 
~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m18000_s08 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const 
salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* m18000_s08: empty kernel body, this entry point performs no work */ +} + +__kernel void m18000_s16 (__global pw_t *pws, __constant const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* m18000_s16: empty kernel body, this entry point performs no work */ +} diff --git a/OpenCL/m18000_a1-optimized.cl b/OpenCL/m18000_a1-optimized.cl new file mode 100644 index 000000000..489c68a35 --- /dev/null +++ b/OpenCL/m18000_a1-optimized.cl @@ -0,0 +1,594 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] 
= /* table of 24 constants; each iteration of the round loop XORs keccakf_rndc[round] into lane a00 (Iota step) */ +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 /* default round count, used unless KECCAK_ROUNDS is already defined; the round loops iterate KECCAK_ROUNDS - 1 times */ +#endif + +__kernel void m18000_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; /* work-items with an index at or beyond gid_max do nothing */ + + u32 pw_buf0[4]; /* pw_buf0 and pw_buf1 receive words i[0..7] of this work-item's pws entry */ + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + 
pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | 
wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = 
a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + 
const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + + +__kernel void m18000_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m18000_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, 
const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m18000_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * modifier + */ + + const u64 lid = get_local_id (0); + + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 pw_buf0[4]; + u32 pw_buf1[4]; + + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; + + const u32 pw_l_len = pws[gid].pw_len; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** 
+ * loop + */ + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); + + const u32x pw_len = pw_l_len + pw_r_len; + + /** + * concat password candidate + */ + + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); + + if (combs_mode == COMBINATOR_MODE_BASE_LEFT) + { + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + w0[2] = wordl0[2] | wordr0[2]; + w0[3] = wordl0[3] | wordr0[3]; + w1[0] = wordl1[0] | wordr1[0]; + w1[1] = wordl1[1] | wordr1[1]; + w1[2] = wordl1[2] | wordr1[2]; + w1[3] = wordl1[3] | wordr1[3]; + w2[0] = wordl2[0] | wordr2[0]; + w2[1] = wordl2[1] | wordr2[1]; + w2[2] = wordl2[2] | wordr2[2]; + w2[3] = wordl2[3] | wordr2[3]; + w3[0] = wordl3[0] | wordr3[0]; + w3[1] = wordl3[1] | wordr3[1]; + w3[2] = wordl3[2] | wordr3[2]; + w3[3] = wordl3[3] | wordr3[3]; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0[0]); + u64x a01 = hl32_to_64 (w0[3], 
w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 
& bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m18000_s08 
(__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ +} + +__kernel void m18000_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 
digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ /* m18000_s16: empty kernel body, this entry point performs no work */ +} diff --git a/OpenCL/m18000_a3-optimized.cl b/OpenCL/m18000_a3-optimized.cl new file mode 100644 index 000000000..ca459c587 --- /dev/null +++ b/OpenCL/m18000_a3-optimized.cl @@ -0,0 +1,695 @@ +/** + * Author......: See docs/credits.txt + * License.....: MIT + */ + +#define NEW_SIMD_CODE + +#include "inc_vendor.cl" +#include "inc_hash_constants.h" +#include "inc_hash_functions.cl" +#include "inc_types.cl" +#include "inc_common.cl" +#include "inc_simd.cl" + +__constant u64a keccakf_rndc[24] = /* table of 24 constants; each iteration of the round loop XORs keccakf_rndc[round] into lane a00 (Iota step) */ +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 /* default round count, used unless KECCAK_ROUNDS is already defined; the round loop iterates KECCAK_ROUNDS - 1 times */ +#endif + +DECLSPEC void m17600m /* NOTE(review): named m17600m although this file is m18000_a3-optimized.cl; presumably a leftover name - confirm, and rename together with every caller if so */ (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 
*d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + 
Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + 
Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_M_SIMD (r0, r1, r2, r3); + } +} + +DECLSPEC void m17600s (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +{ + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[digests_offset].digest_buf[DGST_R0], + digests_buf[digests_offset].digest_buf[DGST_R1], + digests_buf[digests_offset].digest_buf[DGST_R2], + 
digests_buf[digests_offset].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = w0[0]; + + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) + { + const u32x w0r = ix_create_bft (bfs_buf, il_pos); + + const u32x w0lr = w0l | w0r; + + /** + * Keccak + */ + + u64x a00 = hl32_to_64 (w0[1], w0lr); + u64x a01 = hl32_to_64 (w0[3], w0[2]); + u64x a02 = hl32_to_64 (w1[1], w1[0]); + u64x a03 = hl32_to_64 (w1[3], w1[2]); + u64x a04 = hl32_to_64 (w2[1], w2[0]); + u64x a10 = hl32_to_64 (w2[3], w2[2]); + u64x a11 = hl32_to_64 (w3[1], w3[0]); + u64x a12 = hl32_to_64 (w3[3], w3[2]); + u64x a13 = 0x8000000000000000; + u64x a14 = 0; + u64x a20 = 0; + u64x a21 = 0; + u64x a22 = 0; + u64x a23 = 0; + u64x a24 = 0; + u64x a30 = 0; + u64x a31 = 0; + u64x a32 = 0; + u64x a33 = 0; + u64x a34 = 0; + u64x a40 = 0; + u64x a41 = 0; + u64x a42 = 0; + u64x a43 = 0; + u64x a44 = 0; + + #define Rho_Pi(ad,r) \ + bc0 = ad; \ + ad = rotl64 (t, r); \ + t = bc0; \ + + #ifdef _unroll + #pragma unroll + #endif + for (int round = 0; round < KECCAK_ROUNDS - 1; round++) + { + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; a40 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a01 ^= t; a11 ^= t; a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; a42 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a14 ^= t; a24 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 
41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 43); + Rho_Pi (a40, 62); + Rho_Pi (a24, 18); + Rho_Pi (a42, 39); + Rho_Pi (a14, 61); + Rho_Pi (a11, 20); + Rho_Pi (a01, 44); + + // Chi + + bc0 = a00; bc1 = a01; bc2 = a02; bc3 = a03; bc4 = a04; + a00 ^= ~bc1 & bc2; a01 ^= ~bc2 & bc3; a02 ^= ~bc3 & bc4; a03 ^= ~bc4 & bc0; a04 ^= ~bc0 & bc1; + + bc0 = a10; bc1 = a11; bc2 = a12; bc3 = a13; bc4 = a14; + a10 ^= ~bc1 & bc2; a11 ^= ~bc2 & bc3; a12 ^= ~bc3 & bc4; a13 ^= ~bc4 & bc0; a14 ^= ~bc0 & bc1; + + bc0 = a20; bc1 = a21; bc2 = a22; bc3 = a23; bc4 = a24; + a20 ^= ~bc1 & bc2; a21 ^= ~bc2 & bc3; a22 ^= ~bc3 & bc4; a23 ^= ~bc4 & bc0; a24 ^= ~bc0 & bc1; + + bc0 = a30; bc1 = a31; bc2 = a32; bc3 = a33; bc4 = a34; + a30 ^= ~bc1 & bc2; a31 ^= ~bc2 & bc3; a32 ^= ~bc3 & bc4; a33 ^= ~bc4 & bc0; a34 ^= ~bc0 & bc1; + + bc0 = a40; bc1 = a41; bc2 = a42; bc3 = a43; bc4 = a44; + a40 ^= ~bc1 & bc2; a41 ^= ~bc2 & bc3; a42 ^= ~bc3 & bc4; a43 ^= ~bc4 & bc0; a44 ^= ~bc0 & bc1; + + // Iota + + a00 ^= keccakf_rndc[round]; + } + + // Theta + + u64x bc0 = a00 ^ a10 ^ a20 ^ a30 ^ a40; + u64x bc1 = a01 ^ a11 ^ a21 ^ a31 ^ a41; + u64x bc2 = a02 ^ a12 ^ a22 ^ a32 ^ a42; + u64x bc3 = a03 ^ a13 ^ a23 ^ a33 ^ a43; + u64x bc4 = a04 ^ a14 ^ a24 ^ a34 ^ a44; + + u64x t; + + t = bc4 ^ rotl64 (bc1, 1); a00 ^= t; a10 ^= t; a20 ^= t; a30 ^= t; + t = bc0 ^ rotl64 (bc2, 1); a21 ^= t; a31 ^= t; a41 ^= t; + t = bc1 ^ rotl64 (bc3, 1); a02 ^= t; a12 ^= t; a22 ^= t; a32 ^= t; + t = bc2 ^ rotl64 (bc4, 1); a03 ^= t; a13 ^= t; a23 ^= t; a33 ^= t; a43 ^= t; + t = bc3 ^ rotl64 (bc0, 1); a04 ^= t; a34 ^= t; a44 ^= t; + + // Rho Pi + + t = a01; + + Rho_Pi (a20, 1); + Rho_Pi (a12, 3); + Rho_Pi (a21, 6); + Rho_Pi (a32, 10); + Rho_Pi (a33, 15); + Rho_Pi (a03, 21); + Rho_Pi (a10, 28); + Rho_Pi (a31, 36); + Rho_Pi (a13, 45); + Rho_Pi (a41, 55); + Rho_Pi (a44, 2); + Rho_Pi (a04, 14); + Rho_Pi (a30, 27); + Rho_Pi (a43, 41); + Rho_Pi (a34, 56); + Rho_Pi (a23, 8); + Rho_Pi (a22, 25); + Rho_Pi (a02, 
43); + + #undef Rho_Pi + + bc0 = a00; + bc2 = a02; + bc3 = a03; + bc4 = a04; + + a02 ^= ~bc3 & bc4; + a03 ^= ~bc4 & bc0; + + const u32x r0 = l32_from_64 (a03); + const u32x r1 = h32_from_64 (a03); + const u32x r2 = l32_from_64 (a02); + const u32x r3 = h32_from_64 (a02); + + COMPARE_S_SIMD (r0, r1, r2, r3); + } +} + +__kernel void m18000_m04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17600m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, 
bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m18000_m08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17600m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, 
bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m18000_m16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + 
w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17600m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m18000_s04 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + + u32 w2[4]; + + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] 
= 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17600s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m18000_s08 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; + w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + 
w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + + u32 w3[4]; + + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17600s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} + +__kernel void m18000_s16 (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) +{ + /** + * base + */ + + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + u32 w0[4]; + + w0[0] = pws[gid].i[ 0]; + w0[1] = pws[gid].i[ 1]; + w0[2] = pws[gid].i[ 2]; + w0[3] = pws[gid].i[ 3]; + + u32 w1[4]; + + w1[0] = pws[gid].i[ 4]; + w1[1] = pws[gid].i[ 5]; 
+ w1[2] = pws[gid].i[ 6]; + w1[3] = pws[gid].i[ 7]; + + u32 w2[4]; + + w2[0] = pws[gid].i[ 8]; + w2[1] = pws[gid].i[ 9]; + w2[2] = pws[gid].i[10]; + w2[3] = pws[gid].i[11]; + + u32 w3[4]; + + w3[0] = pws[gid].i[12]; + w3[1] = pws[gid].i[13]; + w3[2] = 0; + w3[3] = 0; + + const u32 pw_len = pws[gid].pw_len; + + /** + * main + */ + + m17600s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); +} diff --git a/deps/OpenCL-Headers/CL b/deps/OpenCL-Headers/CL deleted file mode 160000 index bf0f43b76..000000000 --- a/deps/OpenCL-Headers/CL +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bf0f43b76f4556c3d5717f8ba8a01216b27f4af7 diff --git a/deps/git/OpenCL-Headers b/deps/git/OpenCL-Headers new file mode 160000 index 000000000..d51692456 --- /dev/null +++ b/deps/git/OpenCL-Headers @@ -0,0 +1 @@ +Subproject commit d5169245693563d4c69434ba061d92d3d68c4123 diff --git a/docs/changes.txt b/docs/changes.txt index 24d55a490..dc4be4838 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -6,12 +6,26 @@ - Add new option --slow-candidates which allows hashcat to generate passwords on-host +## +## Algorithms +## + +- Added hash-mode 17300 = SHA3-224 +- Added hash-mode 17400 = SHA3-256 +- Added hash-mode 17500 = SHA3-384 +- Added hash-mode 17600 = SHA3-512 +- Added hash-mode 17700 = Keccak-224 +- Added hash-mode 17800 = Keccak-256 +- Added hash-mode 17900 = Keccak-384 +- Added hash-mode 18000 = Keccak-512 +- Removed hash-mode 5000 = SHA-3 (Keccak) + ## ## Improvements ## - Workaround some AMD OpenCL runtime segmentation faults -- Allow bitcoin master key length not be 
exactly 96 byte a multiple of 16 +- Allow bitcoin master key lengths different from 96 bytes, but they must always be a multiple of 16 - Getting rid of OPTS_TYPE_HASH_COPY for Ansible Vault - Add a tracker for salts, amplifier and iterations to status screen - Add option --markov-hcstat2 to make it clear that the new hcstat2 format (compressed hcstat2gen output) must be used @@ -21,6 +35,7 @@ - Added additional hybrid "passthrough" rules, to enable variable-length append/prepend attacks - Increased the maximum size of edata2 in Kerberos 5 TGS-REP etype 23 - Allow hashfile for -m 16800 to be used with -m 16801 +- Make the masks parser more restrictive by rejecting a single '?' at the end of the mask (use ?? instead) ## ## Bugs diff --git a/docs/readme.txt b/docs/readme.txt index 9837c5ae2..4eedbf761 100644 --- a/docs/readme.txt +++ b/docs/readme.txt @@ -41,11 +41,18 @@ NVIDIA GPUs require "NVIDIA Driver" (367.x or later) - MD5 - Half MD5 - SHA1 -- SHA-224 -- SHA-256 -- SHA-384 -- SHA-512 -- SHA-3 (Keccak) +- SHA2-224 +- SHA2-256 +- SHA2-384 +- SHA2-512 +- SHA3-224 +- SHA3-256 +- SHA3-384 +- SHA3-512 +- Keccak-224 +- Keccak-256 +- Keccak-384 +- Keccak-512 - BLAKE2b-512 - SipHash - RIPEMD-160 diff --git a/extra/tab_completion/hashcat.sh b/extra/tab_completion/hashcat.sh index ecb647671..df833b085 100644 --- a/extra/tab_completion/hashcat.sh +++ b/extra/tab_completion/hashcat.sh @@ -176,7 +176,7 @@ _hashcat () { local VERSION=4.2.1 - local HASH_MODES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 124 130 131 132 133 140 141 150 160 200 300 400 500 501 600 900 1000 1100 1400 1410 1411 1420 1421 1430 1440 1441 1450 1460 1500 1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2501 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 3910 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500
6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9710 9720 9800 9810 9820 9900 10000 10100 10200 10300 10400 10410 10420 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12001 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13800 13900 14000 14100 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16801 16900" + local HASH_MODES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 124 130 131 132 133 140 141 150 160 200 300 400 500 501 600 900 1000 1100 1400 1410 1411 1420 1421 1430 1440 1441 1450 1460 1500 1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2501 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 3910 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5100 5200 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9710 9720 9800 9810 9820 9900 10000 10100 10200 10300 10400 10410 10420 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12001 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13800 13900 14000 14100 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16801 16900 17300 17400 17500 17600 17700 17800 17900 18000" local ATTACK_MODES="0 1 3 6 7" local HCCAPX_MESSAGE_PAIRS="0 1 2 3 4 5" local OUTFILE_FORMATS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" diff --git a/include/ext_OpenCL.h b/include/ext_OpenCL.h index beb9bbd0d..f572736e3 100644 --- 
a/include/ext_OpenCL.h +++ b/include/ext_OpenCL.h @@ -6,6 +6,8 @@ #ifndef _EXT_OPENCL_H #define _EXT_OPENCL_H +#define CL_TARGET_OPENCL_VERSION 120 + #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_USE_DEPRECATED_OPENCL_2_0_APIS diff --git a/include/interface.h b/include/interface.h index 92f1ffd07..0ccfd8eb5 100644 --- a/include/interface.h +++ b/include/interface.h @@ -1060,7 +1060,6 @@ typedef enum hash_type HASH_TYPE_ORACLEH = 13, HASH_TYPE_DESRACF = 14, HASH_TYPE_BCRYPT = 15, - HASH_TYPE_KECCAK = 16, HASH_TYPE_NETNTLM = 17, HASH_TYPE_RIPEMD160 = 18, HASH_TYPE_WHIRLPOOL = 19, @@ -1186,7 +1185,6 @@ typedef enum kern_type KERN_TYPE_SHA1_MD5 = 4700, KERN_TYPE_MD5_CHAP = 4800, KERN_TYPE_SHA1_SLT_PW_SLT = 4900, - KERN_TYPE_KECCAK = 5000, KERN_TYPE_MD5H = 5100, KERN_TYPE_PSAFE3 = 5200, KERN_TYPE_IKEPSK_MD5 = 5300, @@ -1325,6 +1323,14 @@ typedef enum kern_type KERN_TYPE_WPA_PMKID_PBKDF2 = 16800, KERN_TYPE_WPA_PMKID_PMK = 16801, KERN_TYPE_ANSIBLE_VAULT = 16900, + KERN_TYPE_SHA3_224 = 17300, + KERN_TYPE_SHA3_256 = 17400, + KERN_TYPE_SHA3_384 = 17500, + KERN_TYPE_SHA3_512 = 17600, + KERN_TYPE_KECCAK_224 = 17700, + KERN_TYPE_KECCAK_256 = 17800, + KERN_TYPE_KECCAK_384 = 17900, + KERN_TYPE_KECCAK_512 = 18000, KERN_TYPE_TOTP_HMACSHA1 = 18100, KERN_TYPE_PLAINTEXT = 99999, @@ -1419,7 +1425,10 @@ int des_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_bu int episerver_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); int postgresql_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); int netscreen_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); -int keccak_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); +int keccak_224_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); +int keccak_256_parse_hash (u8 *input_buf, u32 
input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); +int keccak_384_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); +int keccak_512_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); int blake2b_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); int chacha20_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); int lm_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig); diff --git a/include/types.h b/include/types.h index 6992ddd96..ade1d28a5 100644 --- a/include/types.h +++ b/include/types.h @@ -403,6 +403,7 @@ typedef enum opts_type OPTS_TYPE_AUX4 = (1ULL << 37), OPTS_TYPE_BINARY_HASHFILE = (1ULL << 38), OPTS_TYPE_PREFERED_THREAD = (1ULL << 39), // some algorithms (complicated ones with many branches) benefit from this + OPTS_TYPE_PT_ADD06 = (1ULL << 40), } opts_type_t; @@ -724,8 +725,6 @@ typedef struct salt u32 salt_iter2; u32 salt_sign[2]; - u32 keccak_mdlen; - u32 digests_cnt; u32 digests_done; diff --git a/src/Makefile b/src/Makefile index 1a870cb54..da9b469ed 100644 --- a/src/Makefile +++ b/src/Makefile @@ -118,7 +118,7 @@ endif # MSYS2 ## You have your own headers somewhere, for example: apt-get install opencl-headers ## -OPENCL_HEADERS_KHRONOS := deps/OpenCL-Headers +OPENCL_HEADERS_KHRONOS := deps/git/OpenCL-Headers ## ## Cross compiler paths diff --git a/src/interface.c b/src/interface.c index 40e4a2b2b..fcd665a34 100644 --- a/src/interface.c +++ b/src/interface.c @@ -137,7 +137,6 @@ static const char *ST_HASH_04522 = "9038129c474caa3f0de56f38db84033d0fe1d4b8:365 static const char *ST_HASH_04700 = "92d85978d884eb1d99a51652b1139c8279fa8663"; static const char *ST_HASH_04800 = "aa4aaa1d52319525023c06a4873f4c51:35343534373533343633383832343736:dc"; static const char *ST_HASH_04900 = 
"75d280ca9a0c2ee18729603104ead576d9ca6285:347070"; -static const char *ST_HASH_05000 = "203f88777f18bb4ee1226627b547808f38d90d3e106262b5de9ca943b57137b6"; static const char *ST_HASH_05100 = "8743b52063cd8409"; static const char *ST_HASH_05200 = "50575333e4e2a590a5e5c8269f57ec04a8a1c0c03da55b311c51236dab8c6b96b0afca02000800005eaeee20c6cc10d5caa6522b3ca545c41d9133d630ca08f467b7aae8a2bbef51aa2df968d10b9c4cfb17a182c0add7acb8c153794f51337e12f472f451d10e6dcac664ed760606aabdbb6b794a80d6ce2a330100c76de0ff961a45cca21576b893d826c52f272b97cdf48aca6fbe6c74b039f81c61b7d632fb6fddd9f96162ab1effd69a4598a331e855e38792e5365272d4791bf991d248e1585a9ad20ea3d77b5d2ef9a711ef90a70ec6991cb578f1b8bdaa9efa7b0039e9ea96f777491713047bdd99fa1d78f06f23406a66046b387d3034e46b1f84129bba853cc18fa49f107dc0290547258d30566a4b1b363ff4c1c16cb2f5f400059833d4b651bfa508200cbdc7a75fc57ef90eb1d90b0deea8505753332d454f46505753332d454f466236710e2e2477878e738b60d0aa2834a96b01e97764fe980243a06ad16939d1"; static const char *ST_HASH_05300 = 
"50503326cac6e4bd892b8257805b5a59a285f464ad3f63dc01bd0335f8341ef52e00be0b8cb205422a3788f021e4e6e8ccbe34784bc85abe42f62545bac64888426a2f1264fa28cf384ff00b14cfa5eff562dda4fad2a31fd7a6715218cff959916deed856feea5bee2e773241c5fbebf202958f0ce0c432955e0f1f6d1259da:688a7bfa8d5819630a970ed6d27018021a15fbb3e2fdcc36ce9b563d8ff95f510c4b3236c014d1cde9c2f1a999b121bc3ab1bc8049c8ac1e8c167a84f53c867492723eb01ab4b38074b38f4297d6fea8f44e01ea828fce33c433430938b1551f60673ce8088e7d2f41e3b49315344046fefee1e3860064331417562761db3ba4:c66606d691eaade4:8bdc88a2cdb4a1cf:c3b13137fae9f66684d98709939e5c3454ee31a98c80a1c76427d805b5dea866eff045515e8fb42dd259b9448caba9d937f4b3b75ec1b092a92232b4c8c1e70a60a52076e907f887b731d0f66e19e09b535238169c74c04a4b393f9b815c54eef4558cd8a22c9018bb4f24ee6db0e32979f9a353361cdba948f9027551ee40b1c96ba81c28aa3e1a0fac105dc469efa83f6d3ee281b945c6fa8b4677bac26dda:53f757c5b08afad6:aa02d9289e1702e5d7ed1e4ebf35ab31c2688e00:aab8580015cf545ac0b7291d15a4f2c79e06defd:944a0df3939f3bd281c9d05fbc0e3d30"; @@ -281,6 +280,14 @@ static const char *ST_HASH_16700 = "$fvde$1$16$84286044060108438487434858307513$ static const char *ST_HASH_16800 = "2582a8281bf9d4308d6f5731d0e61c61*4604ba734d4e*89acf0e761f4*ed487162465a774bfba60eb603a39f3a"; static const char *ST_HASH_16801 = "2582a8281bf9d4308d6f5731d0e61c61*4604ba734d4e*89acf0e761f4"; static const char *ST_HASH_16900 = "$ansible$0*0*6b761adc6faeb0cc0bf197d3d4a4a7d3f1682e4b169cae8fa6b459b3214ed41e*426d313c5809d4a80a4b9bc7d4823070*d8bad190c7fbc7c3cb1c60a27abfb0ff59d6fb73178681c7454d94a0f56a4360"; +static const char *ST_HASH_17300 = "412ef78534ba6ab0e9b1607d3e9767a25c1ea9d5e83176b4c2817a6c"; +static const char *ST_HASH_17400 = "d60fcf6585da4e17224f58858970f0ed5ab042c3916b76b0b828e62eaf636cbd"; +static const char *ST_HASH_17500 = "983ba28532cc6320d04f20fa485bcedb38bddb666eca5f1e5aa279ff1c6244fe5f83cf4bbf05b95ff378dd2353617221"; +static const char *ST_HASH_17600 = 
"7c2dc1d743735d4e069f3bda85b1b7e9172033dfdd8cd599ca094ef8570f3930c3f2c0b7afc8d6152ce4eaad6057a2ff22e71934b3a3dd0fb55a7fc84a53144e"; +static const char *ST_HASH_17700 = "e1dfad9bafeae6ef15f5bbb16cf4c26f09f5f1e7870581962fc84636"; +static const char *ST_HASH_17800 = "203f88777f18bb4ee1226627b547808f38d90d3e106262b5de9ca943b57137b6"; +static const char *ST_HASH_17900 = "5804b7ada5806ba79540100e9a7ef493654ff2a21d94d4f2ce4bf69abda5d94bf03701fe9525a15dfdc625bfbd769701"; +static const char *ST_HASH_18000 = "2fbf5c9080f0a704de2e915ba8fdae6ab00bbc026b2c1c8fa07da1239381c6b7f4dfd399bf9652500da723694a4c719587dd0219cb30eabe61210a8ae4dc0b03"; static const char *ST_HASH_18100 = "597056:3600"; static const char *ST_HASH_99999 = "hashcat"; @@ -369,8 +376,8 @@ static const char *HT_00600 = "BLAKE2b"; static const char *HT_00900 = "MD4"; static const char *HT_01000 = "NTLM"; static const char *HT_01100 = "Domain Cached Credentials (DCC), MS Cache"; -static const char *HT_01300 = "SHA-224"; -static const char *HT_01400 = "SHA-256"; +static const char *HT_01300 = "SHA2-224"; +static const char *HT_01400 = "SHA2-256"; static const char *HT_01410 = "sha256($pass.$salt)"; static const char *HT_01420 = "sha256($salt.$pass)"; static const char *HT_01430 = "sha256(utf16le($pass).$salt)"; @@ -379,7 +386,7 @@ static const char *HT_01450 = "HMAC-SHA256 (key = $pass)"; static const char *HT_01460 = "HMAC-SHA256 (key = $salt)"; static const char *HT_01500 = "descrypt, DES (Unix), Traditional DES"; static const char *HT_01600 = "Apache $apr1$ MD5, md5apr1, MD5 (APR)"; -static const char *HT_01700 = "SHA-512"; +static const char *HT_01700 = "SHA2-512"; static const char *HT_01710 = "sha512($pass.$salt)"; static const char *HT_01720 = "sha512($salt.$pass)"; static const char *HT_01730 = "sha512(utf16le($pass).$salt)"; @@ -409,7 +416,6 @@ static const char *HT_04520 = "sha1($salt.sha1($pass))"; static const char *HT_04700 = "sha1(md5($pass))"; static const char *HT_04800 = "iSCSI CHAP authentication, 
MD5(CHAP)"; static const char *HT_04900 = "sha1($salt.$pass.$salt)"; -static const char *HT_05000 = "SHA-3 (Keccak)"; static const char *HT_05100 = "Half MD5"; static const char *HT_05200 = "Password Safe v3"; static const char *HT_05300 = "IKE-PSK MD5"; @@ -472,7 +478,7 @@ static const char *HT_10420 = "PDF 1.1 - 1.3 (Acrobat 2 - 4), collider #2"; static const char *HT_10500 = "PDF 1.4 - 1.6 (Acrobat 5 - 8)"; static const char *HT_10600 = "PDF 1.7 Level 3 (Acrobat 9)"; static const char *HT_10700 = "PDF 1.7 Level 8 (Acrobat 10 - 11)"; -static const char *HT_10800 = "SHA-384"; +static const char *HT_10800 = "SHA2-384"; static const char *HT_10900 = "PBKDF2-HMAC-SHA256"; static const char *HT_11000 = "PrestaShop"; static const char *HT_11100 = "PostgreSQL CRAM (MD5)"; @@ -530,6 +536,14 @@ static const char *HT_16700 = "FileVault 2"; static const char *HT_16800 = "WPA-PMKID-PBKDF2"; static const char *HT_16801 = "WPA-PMKID-PMK"; static const char *HT_16900 = "Ansible Vault"; +static const char *HT_17300 = "SHA3-224"; +static const char *HT_17400 = "SHA3-256"; +static const char *HT_17500 = "SHA3-384"; +static const char *HT_17600 = "SHA3-512"; +static const char *HT_17700 = "Keccak-224"; +static const char *HT_17800 = "Keccak-256"; +static const char *HT_17900 = "Keccak-384"; +static const char *HT_18000 = "Keccak-512"; static const char *HT_18100 = "TOTP (HMAC-SHA1)"; static const char *HT_99999 = "Plaintext"; @@ -6222,18 +6236,16 @@ int sha512crypt_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYB return (PARSER_OK); } -int keccak_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig) +int keccak_224_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig) { - u64 *digest = (u64 *) hash_buf->digest; - - salt_t *salt = hash_buf->salt; + u32 *digest = (u32 *) hash_buf->digest; token_t token; token.token_cnt = 1; - token.len_min[0] = 16; - token.len_max[0] = 400; + 
token.len_min[0] = 56; + token.len_max[0] = 56; token.attr[0] = TOKEN_ATTR_FIXED_LENGTH | TOKEN_ATTR_VERIFY_HEX; @@ -6244,16 +6256,111 @@ int keccak_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNU u8 *hash_pos = token.buf[0]; int hash_len = token.len[0]; - if (hash_len % 16) return (PARSER_GLOBAL_LENGTH); + if (hash_len != 56) return (PARSER_GLOBAL_LENGTH); - u32 keccak_mdlen = hash_len / 2; + digest[0] = hex_to_u32 (hash_pos + 0); + digest[1] = hex_to_u32 (hash_pos + 8); + digest[2] = hex_to_u32 (hash_pos + 16); + digest[3] = hex_to_u32 (hash_pos + 24); + digest[4] = hex_to_u32 (hash_pos + 32); + digest[5] = hex_to_u32 (hash_pos + 40); + digest[6] = hex_to_u32 (hash_pos + 48); - for (u32 i = 0, j = 0; i < keccak_mdlen / 8; i += 1, j += 16) - { - digest[i] = hex_to_u64 (hash_pos + j); - } + return (PARSER_OK); +} - salt->keccak_mdlen = keccak_mdlen; +int keccak_256_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig) +{ + u64 *digest = (u64 *) hash_buf->digest; + + token_t token; + + token.token_cnt = 1; + + token.len_min[0] = 64; + token.len_max[0] = 64; + token.attr[0] = TOKEN_ATTR_FIXED_LENGTH + | TOKEN_ATTR_VERIFY_HEX; + + const int rc_tokenizer = input_tokenizer (input_buf, input_len, &token); + + if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); + + u8 *hash_pos = token.buf[0]; + int hash_len = token.len[0]; + + if (hash_len != 64) return (PARSER_GLOBAL_LENGTH); + + digest[0] = hex_to_u64 (hash_pos + 0); + digest[1] = hex_to_u64 (hash_pos + 16); + digest[2] = hex_to_u64 (hash_pos + 32); + digest[3] = hex_to_u64 (hash_pos + 48); + + return (PARSER_OK); +} + +int keccak_384_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig) +{ + u64 *digest = (u64 *) hash_buf->digest; + + token_t token; + + token.token_cnt = 1; + + token.len_min[0] = 96; + token.len_max[0] = 96; + token.attr[0] = TOKEN_ATTR_FIXED_LENGTH + | TOKEN_ATTR_VERIFY_HEX; + + const int 
rc_tokenizer = input_tokenizer (input_buf, input_len, &token); + + if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); + + u8 *hash_pos = token.buf[0]; + int hash_len = token.len[0]; + + if (hash_len != 96) return (PARSER_GLOBAL_LENGTH); + + digest[0] = hex_to_u64 (hash_pos + 0); + digest[1] = hex_to_u64 (hash_pos + 16); + digest[2] = hex_to_u64 (hash_pos + 32); + digest[3] = hex_to_u64 (hash_pos + 48); + digest[4] = hex_to_u64 (hash_pos + 64); + digest[5] = hex_to_u64 (hash_pos + 80); + + return (PARSER_OK); +} + +int keccak_512_parse_hash (u8 *input_buf, u32 input_len, hash_t *hash_buf, MAYBE_UNUSED hashconfig_t *hashconfig) +{ + u64 *digest = (u64 *) hash_buf->digest; + + token_t token; + + token.token_cnt = 1; + + token.len_min[0] = 128; + token.len_max[0] = 128; + token.attr[0] = TOKEN_ATTR_FIXED_LENGTH + | TOKEN_ATTR_VERIFY_HEX; + + const int rc_tokenizer = input_tokenizer (input_buf, input_len, &token); + + if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); + + u8 *hash_pos = token.buf[0]; + int hash_len = token.len[0]; + + if (hash_len != 128) return (PARSER_GLOBAL_LENGTH); + + digest[0] = hex_to_u64 (hash_pos + 0); + digest[1] = hex_to_u64 (hash_pos + 16); + digest[2] = hex_to_u64 (hash_pos + 32); + digest[3] = hex_to_u64 (hash_pos + 48); + digest[4] = hex_to_u64 (hash_pos + 64); + digest[5] = hex_to_u64 (hash_pos + 80); + digest[6] = hex_to_u64 (hash_pos + 96); + digest[7] = hex_to_u64 (hash_pos + 112); return (PARSER_OK); } @@ -18305,7 +18412,6 @@ const char *strhashtype (const u32 hash_mode) case 4700: return HT_04700; case 4800: return HT_04800; case 4900: return HT_04900; - case 5000: return HT_05000; case 5100: return HT_05100; case 5200: return HT_05200; case 5300: return HT_05300; @@ -18457,6 +18563,14 @@ const char *strhashtype (const u32 hash_mode) case 16800: return HT_16800; case 16801: return HT_16801; case 16900: return HT_16900; + case 17300: return HT_17300; + case 17400: return HT_17400; + case 17500: return HT_17500; + case 17600: 
return HT_17600; + case 17700: return HT_17700; + case 17800: return HT_17800; + case 17900: return HT_17900; + case 18000: return HT_18000; case 18100: return HT_18100; case 99999: return HT_99999; } @@ -22173,6 +22287,56 @@ int ascii_digest (hashcat_ctx_t *hashcat_ctx, char *out_buf, const size_t out_le byte_swap_32 (digest_buf[6]), byte_swap_32 (digest_buf[7])); } + else if (hash_mode == 17300 || hash_mode == 17700) + { + u32 *ptr = digest_buf; + + snprintf (out_buf, out_len - 1, "%08x%08x%08x%08x%08x%08x%08x", + ptr[1], ptr[0], + ptr[3], ptr[2], + ptr[5], ptr[4], + ptr[7] + ); + } + else if (hash_mode == 17400 || hash_mode == 17800) + { + u32 *ptr = digest_buf; + + snprintf (out_buf, out_len - 1, "%08x%08x%08x%08x%08x%08x%08x%08x", + ptr[1], ptr[0], + ptr[3], ptr[2], + ptr[5], ptr[4], + ptr[7], ptr[6] + ); + } + else if (hash_mode == 17500 || hash_mode == 17900) + { + u32 *ptr = digest_buf; + + snprintf (out_buf, out_len - 1, "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x", + ptr[ 1], ptr[ 0], + ptr[ 3], ptr[ 2], + ptr[ 5], ptr[ 4], + ptr[ 7], ptr[ 6], + ptr[ 9], ptr[ 8], + ptr[11], ptr[10] + ); + } + else if (hash_mode == 17600 || hash_mode == 18000) + { + u32 *ptr = digest_buf; + + snprintf (out_buf, out_len - 1, "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x", + ptr[ 1], ptr[ 0], + ptr[ 3], ptr[ 2], + ptr[ 5], ptr[ 4], + ptr[ 7], ptr[ 6], + ptr[ 9], ptr[ 8], + ptr[11], ptr[10], + ptr[13], ptr[12], + ptr[15], ptr[14] + ); + } else if (hash_mode == 18100) { // salt_buf[1] holds our 32 bit value. salt_buf[0] and salt_buf[1] would be 64 bits. 
@@ -22284,40 +22448,6 @@ int ascii_digest (hashcat_ctx_t *hashcat_ctx, char *out_buf, const size_t out_le snprintf (out_buf, out_len - 1, "%s$%s", (char *) salt.salt_sign, tmp_buf); } - else if (hash_type == HASH_TYPE_KECCAK) - { - u32 *ptr = digest_buf; - - snprintf (out_buf, out_len - 1, "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x", - ptr[ 1], ptr[ 0], - ptr[ 3], ptr[ 2], - ptr[ 5], ptr[ 4], - ptr[ 7], ptr[ 6], - ptr[ 9], ptr[ 8], - ptr[11], ptr[10], - ptr[13], ptr[12], - ptr[15], ptr[14], - ptr[17], ptr[16], - ptr[19], ptr[18], - ptr[21], ptr[20], - ptr[23], ptr[22], - ptr[25], ptr[24], - ptr[27], ptr[26], - ptr[29], ptr[28], - ptr[31], ptr[30], - ptr[33], ptr[32], - ptr[35], ptr[34], - ptr[37], ptr[36], - ptr[39], ptr[38], - ptr[41], ptr[30], - ptr[43], ptr[42], - ptr[45], ptr[44], - ptr[47], ptr[46], - ptr[49], ptr[48] - ); - - out_buf[salt.keccak_mdlen * 2] = 0; - } else if (hash_type == HASH_TYPE_BLAKE2B) { u32 *ptr = digest_buf; @@ -24582,25 +24712,6 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; break; - case 5000: hashconfig->hash_type = HASH_TYPE_KECCAK; - hashconfig->salt_type = SALT_TYPE_EMBEDDED; - hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; - hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE - | OPTS_TYPE_PT_ADD01; - hashconfig->kern_type = KERN_TYPE_KECCAK; - hashconfig->dgst_size = DGST_SIZE_8_25; - hashconfig->parse_func = keccak_parse_hash; - hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_USES_BITS_64 - | OPTI_TYPE_RAW_HASH; - hashconfig->dgst_pos0 = 6; - hashconfig->dgst_pos1 = 7; - hashconfig->dgst_pos2 = 4; - hashconfig->dgst_pos3 = 5; - hashconfig->st_hash = ST_HASH_05000; - hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; - break; - case 5100: hashconfig->hash_type = HASH_TYPE_MD5H; hashconfig->salt_type = SALT_TYPE_NONE; 
hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; @@ -27378,6 +27489,150 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; break; + case 17300: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD06; + hashconfig->kern_type = KERN_TYPE_SHA3_224; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_224_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + hashconfig->st_hash = ST_HASH_17300; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + break; + + case 17400: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD06; + hashconfig->kern_type = KERN_TYPE_SHA3_256; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_256_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + hashconfig->st_hash = ST_HASH_17400; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + break; + + case 17500: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD06; + hashconfig->kern_type = KERN_TYPE_SHA3_384; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_384_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + 
hashconfig->st_hash = ST_HASH_17500; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + break; + + case 17600: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD06; + hashconfig->kern_type = KERN_TYPE_SHA3_512; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_512_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + hashconfig->st_hash = ST_HASH_17600; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + break; + + case 17700: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD01; + hashconfig->kern_type = KERN_TYPE_KECCAK_224; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_224_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + hashconfig->st_hash = ST_HASH_17700; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + break; + + case 17800: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD01; + hashconfig->kern_type = KERN_TYPE_KECCAK_256; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_256_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + hashconfig->st_hash = ST_HASH_17800; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + 
break; + + case 17900: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD01; + hashconfig->kern_type = KERN_TYPE_KECCAK_384; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_384_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + hashconfig->st_hash = ST_HASH_17900; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + break; + + case 18000: hashconfig->salt_type = SALT_TYPE_EMBEDDED; + hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; + hashconfig->opts_type = OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_PT_ADD01; + hashconfig->kern_type = KERN_TYPE_KECCAK_512; + hashconfig->dgst_size = DGST_SIZE_8_25; + hashconfig->parse_func = keccak_512_parse_hash; + hashconfig->opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_RAW_HASH; + hashconfig->dgst_pos0 = 6; + hashconfig->dgst_pos1 = 7; + hashconfig->dgst_pos2 = 4; + hashconfig->dgst_pos3 = 5; + hashconfig->st_hash = ST_HASH_18000; + hashconfig->st_pass = ST_PASS_HASHCAT_PLAIN; + break; + case 18100: hashconfig->hash_type = HASH_TYPE_SHA1; hashconfig->salt_type = SALT_TYPE_EMBEDDED; hashconfig->attack_exec = ATTACK_EXEC_INSIDE_KERNEL; @@ -27498,6 +27753,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) hashconfig->opts_type &= ~OPTS_TYPE_PT_UTF16BE; hashconfig->opts_type &= ~OPTS_TYPE_PT_ADD01; hashconfig->opts_type &= ~OPTS_TYPE_PT_ADD02; + hashconfig->opts_type &= ~OPTS_TYPE_PT_ADD06; hashconfig->opts_type &= ~OPTS_TYPE_PT_ADD80; hashconfig->opts_type &= ~OPTS_TYPE_PT_ADDBITS14; hashconfig->opts_type &= ~OPTS_TYPE_PT_ADDBITS15; @@ -28387,8 +28643,6 @@ void hashconfig_benchmark_defaults (hashcat_ctx_t *hashcat_ctx, salt_t *salt, vo break; case 3100: salt->salt_len = 1; break; - case 
5000: salt->keccak_mdlen = 32; - break; case 5800: salt->salt_len = 16; break; case 6800: salt->salt_len = 32; diff --git a/src/main.c b/src/main.c index 93e63e8bf..5b3210196 100644 --- a/src/main.c +++ b/src/main.c @@ -205,6 +205,14 @@ static void main_outerloop_finished (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MA hashcat_user_t *hashcat_user = hashcat_ctx->hashcat_user; status_ctx_t *status_ctx = hashcat_ctx->status_ctx; + // we should never stop hashcat with STATUS_INIT: + // keypress thread blocks on STATUS_INIT forever! + + if (status_ctx->devices_status == STATUS_INIT) + { + status_ctx->devices_status = STATUS_ERROR; + } + // wait for outer threads status_ctx->shutdown_outer = true; diff --git a/src/mpsp.c b/src/mpsp.c index 1a63fd1dc..e306e6c26 100644 --- a/src/mpsp.c +++ b/src/mpsp.c @@ -268,7 +268,12 @@ static int mp_expand (hashcat_ctx_t *hashcat_ctx, const char *in_buf, size_t in_ { in_pos++; - if (in_pos == in_len) break; + if (in_pos == in_len) + { + event_log_error (hashcat_ctx, "Syntax error in mask: %s", in_buf); + + return -1; + } u32 p1 = in_buf[in_pos] & 0xff; @@ -306,7 +311,7 @@ static int mp_expand (hashcat_ctx_t *hashcat_ctx, const char *in_buf, size_t in_ break; case '?': rc = mp_add_cs_buf (hashcat_ctx, &p0, 1, mp_usr, mp_usr_offset); break; - default: event_log_error (hashcat_ctx, "Syntax error: %s", in_buf); + default: event_log_error (hashcat_ctx, "Syntax error in mask: %s", in_buf); return -1; } @@ -372,7 +377,12 @@ static int mp_gen_css (hashcat_ctx_t *hashcat_ctx, char *mask_buf, size_t mask_l { mask_pos++; - if (mask_pos == mask_len) break; + if (mask_pos == mask_len) + { + event_log_error (hashcat_ctx, "Syntax error in mask: %s", mask_buf); + + return -1; + } char p1 = mask_buf[mask_pos]; @@ -412,7 +422,7 @@ static int mp_gen_css (hashcat_ctx_t *hashcat_ctx, char *mask_buf, size_t mask_l break; case '?': rc = mp_add_cs_buf (hashcat_ctx, &chr, 1, css_buf, css_pos); break; - default: event_log_error (hashcat_ctx, "Syntax error: %s", 
mask_buf); + default: event_log_error (hashcat_ctx, "Syntax error in mask: %s", mask_buf); return -1; } diff --git a/src/opencl.c b/src/opencl.c index c8f618ae1..6e31855f7 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -35,6 +35,7 @@ static const char *drm_card0_driver_path = "/sys/class/drm/card0/device/driver"; #endif static const u32 full01 = 0x01010101; +static const u32 full06 = 0x06060606; static const u32 full80 = 0x80808080; static double TARGET_MSEC_PROFILE[4] = { 2, 12, 96, 480 }; @@ -2153,6 +2154,10 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { rebuild_pws_compressed_append (device_param, pws_cnt, 0x01); } + else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + rebuild_pws_compressed_append (device_param, pws_cnt, 0x06); + } else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) { rebuild_pws_compressed_append (device_param, pws_cnt, 0x80); @@ -2165,6 +2170,10 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { rebuild_pws_compressed_append (device_param, pws_cnt, 0x01); } + else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + rebuild_pws_compressed_append (device_param, pws_cnt, 0x06); + } else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) { rebuild_pws_compressed_append (device_param, pws_cnt, 0x80); @@ -2472,6 +2481,11 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co ptr[line_len] = 0x80; } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + ptr[line_len] = 0x06; + } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) { ptr[line_len] = 0x01; @@ -2590,6 +2604,11 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co ptr[line_len] = 0x80; } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + ptr[line_len] = 0x06; + } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) { ptr[line_len] = 0x01; @@ -6313,6 +6332,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) 
device_param->kernel_params_mp_buf32[7] = 0; if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) device_param->kernel_params_mp_buf32[5] = full01; + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) device_param->kernel_params_mp_buf32[5] = full06; if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) device_param->kernel_params_mp_buf32[5] = full80; if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_buf32[6] = 1; if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_buf32[7] = 1; @@ -6337,6 +6357,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_mp_l_buf32[8] = 0; if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) device_param->kernel_params_mp_l_buf32[6] = full01; + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) device_param->kernel_params_mp_l_buf32[6] = full06; if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) device_param->kernel_params_mp_l_buf32[6] = full80; if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS14) device_param->kernel_params_mp_l_buf32[7] = 1; if (hashconfig->opts_type & OPTS_TYPE_PT_ADDBITS15) device_param->kernel_params_mp_l_buf32[8] = 1; diff --git a/src/selftest.c b/src/selftest.c index 5f17e853e..99a946991 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -126,6 +126,11 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param comb_ptr[comb.pw_len] = 0x01; } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + comb_ptr[comb.pw_len] = 0x06; + } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) { comb_ptr[comb.pw_len] = 0x80; @@ -264,6 +269,11 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param pw_ptr[new_pass_len] = 0x01; } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD06) + { + pw_ptr[new_pass_len] = 0x06; + } + if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) { pw_ptr[new_pass_len] = 0x80; diff --git a/src/usage.c b/src/usage.c index a3698c314..c5a82955b 100644 --- a/src/usage.c +++ b/src/usage.c @@ 
-125,11 +125,18 @@ static const char *const USAGE_BIG[] = " 0 | MD5 | Raw Hash", " 5100 | Half MD5 | Raw Hash", " 100 | SHA1 | Raw Hash", - " 1300 | SHA-224 | Raw Hash", - " 1400 | SHA-256 | Raw Hash", - " 10800 | SHA-384 | Raw Hash", - " 1700 | SHA-512 | Raw Hash", - " 5000 | SHA-3 (Keccak) | Raw Hash", + " 1300 | SHA2-224 | Raw Hash", + " 1400 | SHA2-256 | Raw Hash", + " 10800 | SHA2-384 | Raw Hash", + " 1700 | SHA2-512 | Raw Hash", + " 17300 | SHA3-224 | Raw Hash", + " 17400 | SHA3-256 | Raw Hash", + " 17500 | SHA3-384 | Raw Hash", + " 17600 | SHA3-512 | Raw Hash", + " 17700 | Keccak-224 | Raw Hash", + " 17800 | Keccak-256 | Raw Hash", + " 17900 | Keccak-384 | Raw Hash", + " 18000 | Keccak-512 | Raw Hash", " 600 | BLAKE2b-512 | Raw Hash", " 10100 | SipHash | Raw Hash", " 6000 | RIPEMD-160 | Raw Hash", diff --git a/src/user_options.c b/src/user_options.c index 9f0e6ebc0..acae2b618 100644 --- a/src/user_options.c +++ b/src/user_options.c @@ -977,7 +977,7 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx) if (user_options->attack_mode == ATTACK_MODE_COMBI) { - event_log_error (hashcat_ctx, "Custom charsets re not supported in attack mode 1 (combination)."); + event_log_error (hashcat_ctx, "Custom charsets are not supported in attack mode 1 (combination)."); return -1; } diff --git a/tools/test.pl b/tools/test.pl index acd577df2..c5620d27b 100755 --- a/tools/test.pl +++ b/tools/test.pl @@ -7,7 +7,7 @@ ## install help: ## -## cpan install Authen::Passphrase::LANManager Authen::Passphrase::MySQL323 Authen::Passphrase::NTHash Authen::Passphrase::PHPass Crypt::CBC Crypt::DES Crypt::Digest::RIPEMD160 Crypt::Digest::Whirlpool Crypt::ECB Crypt::Eksblowfish::Bcrypt Crypt::Mode::ECB Crypt::MySQL Crypt::OpenSSH::ChachaPoly Crypt::PBKDF2 Crypt::RC4 Crypt::Rijndael Crypt::ScryptKDF Crypt::Skip32 Crypt::Twofish Crypt::UnixCrypt_XS Digest::BLAKE2 Digest::CMAC Digest::CRC Digest::GOST Digest::HMAC Digest::HMAC_MD5 Digest::Keccak Digest::MD4 Digest::MD5 Digest::Perl::MD5 
Digest::SHA Digest::SipHash JSON Net::DNS::RR::NSEC3 Net::DNS::SEC Convert::EBCDIC +## cpan install Authen::Passphrase::LANManager Authen::Passphrase::MySQL323 Authen::Passphrase::NTHash Authen::Passphrase::PHPass Crypt::CBC Crypt::DES Crypt::Digest::RIPEMD160 Crypt::Digest::Whirlpool Crypt::ECB Crypt::Eksblowfish::Bcrypt Crypt::Mode::ECB Crypt::MySQL Crypt::OpenSSH::ChachaPoly Crypt::PBKDF2 Crypt::RC4 Crypt::Rijndael Crypt::ScryptKDF Crypt::Skip32 Crypt::Twofish Crypt::UnixCrypt_XS Digest::BLAKE2 Digest::CMAC Digest::CRC Digest::GOST Digest::HMAC Digest::HMAC_MD5 Digest::Keccak Digest::MD4 Digest::MD5 Digest::Perl::MD5 Digest::SHA Digest::SHA3 Digest::SipHash JSON Net::DNS::RR::NSEC3 Net::DNS::SEC Convert::EBCDIC ## use strict; @@ -15,8 +15,9 @@ use warnings; use Digest::MD4 qw (md4 md4_hex); use Digest::MD5 qw (md5 md5_hex); use Digest::SHA qw (sha1 sha256 sha384 sha512 sha1_hex sha224_hex sha256_hex sha384_hex sha512_hex hmac_sha1 hmac_sha256 hmac_sha512); +use Digest::SHA3 qw (sha3_224_hex sha3_256_hex sha3_384_hex sha3_512_hex); +use Digest::Keccak qw (keccak_224_hex keccak_256_hex keccak_384_hex keccak_512_hex); use Digest::HMAC qw (hmac hmac_hex); -use Digest::Keccak qw (keccak_256_hex); use Digest::BLAKE2 qw (blake2b_hex); use Crypt::MySQL qw (password41); use Digest::GOST qw (gost gost_hex); @@ -58,7 +59,7 @@ my $hashcat = "./hashcat"; my $MAX_LEN = 55; -my @modes = (0, 10, 11, 12, 20, 21, 22, 23, 30, 40, 50, 60, 100, 101, 110, 111, 112, 120, 121, 122, 125, 130, 131, 132, 133, 140, 141, 150, 160, 200, 300, 400, 500, 600, 900, 1000, 1100, 1300, 1400, 1410, 1411, 1420, 1430, 1440, 1441, 1450, 1460, 1500, 1600, 1700, 1710, 1711, 1720, 1730, 1740, 1722, 1731, 1750, 1760, 1800, 2100, 2400, 2410, 2500, 2600, 2611, 2612, 2711, 2811, 3000, 3100, 3200, 3710, 3711, 3300, 3500, 3610, 3720, 3800, 3910, 4010, 4110, 4210, 4300, 4400, 4500, 4520, 4521, 4522, 4600, 4700, 4800, 4900, 5000, 5100, 5300, 5400, 5500, 5600, 5700, 5800, 6000, 6100, 6300, 6400, 6500, 6600, 6700, 
6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7700, 7701, 7800, 7801, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8900, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000, 11100, 11200, 11300, 11400, 11500, 11600, 11900, 12000, 12001, 12100, 12200, 12300, 12400, 12600, 12700, 12800, 12900, 13000, 13100, 13200, 13300, 13400, 13500, 13600, 13800, 13900, 14000, 14100, 14400, 14700, 14800, 14900, 15000, 15100, 15200, 15300, 15400, 15500, 15600, 15700, 15900, 16000, 16100, 16200, 16300, 16400, 16500, 16600, 16700, 16800, 16900, 18100, 99999); +my @modes = (0, 10, 11, 12, 20, 21, 22, 23, 30, 40, 50, 60, 100, 101, 110, 111, 112, 120, 121, 122, 125, 130, 131, 132, 133, 140, 141, 150, 160, 200, 300, 400, 500, 600, 900, 1000, 1100, 1300, 1400, 1410, 1411, 1420, 1430, 1440, 1441, 1450, 1460, 1500, 1600, 1700, 1710, 1711, 1720, 1730, 1740, 1722, 1731, 1750, 1760, 1800, 2100, 2400, 2410, 2500, 2600, 2611, 2612, 2711, 2811, 3000, 3100, 3200, 3710, 3711, 3300, 3500, 3610, 3720, 3800, 3910, 4010, 4110, 4210, 4300, 4400, 4500, 4520, 4521, 4522, 4600, 4700, 4800, 4900, 5100, 5300, 5400, 5500, 5600, 5700, 5800, 6000, 6100, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7700, 7701, 7800, 7801, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8900, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000, 11100, 11200, 11300, 11400, 11500, 11600, 11900, 12000, 12001, 12100, 12200, 12300, 12400, 12600, 12700, 12800, 12900, 13000, 13100, 13200, 13300, 13400, 13500, 13600, 13800, 13900, 14000, 14100, 14400, 14700, 14800, 14900, 15000, 15100, 15200, 15300, 15400, 15500, 15600, 15700, 15900, 16000, 16100, 16200, 16300, 16400, 16500, 16600, 16700, 16800, 16900, 17300, 17400, 17500, 17600, 17700, 17800, 17900, 18000, 18100, 99999); my %is_utf16le = map { $_ => 1 } qw (30 40 130 131 132 133 140 
141 1000 1100 1430 1440 1441 1730 1740 1731 5500 5600 8000 9400 9500 9600 9700 9800 11600 13500 13800); my %less_fifteen = map { $_ => 1 } qw (500 1600 1800 3200 6300 7400 10500 10700); @@ -226,7 +227,7 @@ sub verify # remember always do "exists ($db->{$hash_in})" checks as soon as possible and don't forget it # unsalted - if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 600 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2400 || $mode == 2600 || $mode == 3000 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode == 4600 || $mode == 4700 || $mode == 5000 || $mode == 5100 || $mode == 5700 || $mode == 6000 || $mode == 6100 || $mode == 6900 || $mode == 8600 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 16000 || $mode == 16400 || $mode == 99999) + if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 600 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2400 || $mode == 2600 || $mode == 3000 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode == 4600 || $mode == 4700 || $mode == 5100 || $mode == 5700 || $mode == 6000 || $mode == 6100 || $mode == 6900 || $mode == 8600 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 16000 || $mode == 16400 || $mode == 17300 || $mode == 17400 || $mode == 17500 || $mode == 17600 || $mode == 17700 || $mode == 17800 || $mode == 17900 || $mode == 18000 || $mode == 99999) { my $index = index ($line, ":"); @@ -3547,7 +3548,7 @@ sub passthrough my $tmp_hash; - if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 600 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2400 || $mode == 2600 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode 
== 4600 || $mode == 4700 || $mode == 5000 || $mode == 5100 || $mode == 6000 || $mode == 6100 || $mode == 6900 || $mode == 5700 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 13300 || $mode == 16400 || $mode == 99999) + if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 600 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2400 || $mode == 2600 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode == 4600 || $mode == 4700 || $mode == 5100 || $mode == 6000 || $mode == 6100 || $mode == 6900 || $mode == 5700 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 13300 || $mode == 16400 || $mode == 17300 || $mode == 17400 || $mode == 17500 || $mode == 17600 || $mode == 17700 || $mode == 17800 || $mode == 17900 || $mode == 18000 || $mode == 99999) { $tmp_hash = gen_hash ($mode, $word_buf, ""); } @@ -4077,7 +4078,7 @@ sub single { my $mode = $modes[$j]; - if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 600 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2600 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode == 4600 || $mode == 4700 || $mode == 5000 || $mode == 5100 || $mode == 5300 || $mode == 5400 || $mode == 6000 || $mode == 6100 || $mode == 6600 || $mode == 6900 || $mode == 5700 || $mode == 8200 || $mode == 8300 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 13300 || $mode == 16400 || $mode == 99999) + if ($mode == 0 || $mode == 100 || $mode == 101 || $mode == 133 || $mode == 200 || $mode == 300 || $mode == 600 || $mode == 900 || $mode == 1000 || $mode == 1300 || $mode == 1400 || $mode == 1700 || $mode == 2600 || $mode == 3500 || $mode == 4300 || $mode == 4400 || $mode == 4500 || $mode == 4600 || $mode == 4700 || $mode == 5100 || $mode == 5300 || $mode == 5400 || $mode 
== 6000 || $mode == 6100 || $mode == 6600 || $mode == 6900 || $mode == 5700 || $mode == 8200 || $mode == 8300 || $mode == 9900 || $mode == 10800 || $mode == 11500 || $mode == 13300 || $mode == 16400 || $mode == 17300 || $mode == 17400 || $mode == 17500 || $mode == 17600 || $mode == 17700 || $mode == 17800 || $mode == 17900 || $mode == 18000 || $mode == 99999) { for (my $i = 1; $i < 32; $i++) { @@ -6078,12 +6079,6 @@ sub gen_hash $tmp_hash = sprintf ("%s:%s", $hash_buf, $salt_buf); } - elsif ($mode == 5000) - { - $hash_buf = keccak_256_hex ($word_buf); - - $tmp_hash = sprintf ("%s", $hash_buf); - } elsif ($mode == 5100) { my $pos; @@ -9929,25 +9924,68 @@ END_CODE $tmp_hash = sprintf ("%s*%s*%s*%s", substr ($pmkid, 0, 32), $macap, $macsta, $essid); } + elsif ($mode == 17300) + { + $hash_buf = sha3_224_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } + elsif ($mode == 17400) + { + $hash_buf = sha3_256_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } + elsif ($mode == 17500) + { + $hash_buf = sha3_384_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } + elsif ($mode == 17600) + { + $hash_buf = sha3_512_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } + elsif ($mode == 17700) + { + $hash_buf = keccak_224_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } + elsif ($mode == 17800) + { + $hash_buf = keccak_256_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } + elsif ($mode == 17900) + { + $hash_buf = keccak_384_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } + elsif ($mode == 18000) + { + $hash_buf = keccak_512_hex ($word_buf); + + $tmp_hash = sprintf ("%s", $hash_buf); + } elsif ($mode == 18100) { my $paddedTime = sprintf("%016x", int(int($salt_buf) / 30)); my $data = pack('H*', $paddedTime); my $key = $word_buf; - #my $b32_salt_buf = encode_base32($salt_buf); - #print "SECRET: ". $b32_salt_buf . ('=' x (8 - (length($b32_salt_buf) % 8))) ." 
"; $hash_buf = hmac_hex ($data, $key, \&sha1, 64); - #$hash_buf = hmac ($word_buf, $salt_buf, \&sha1, 64); my $offset = hex (substr ($hash_buf, -8)) & 0xf; $offset *= 2; my $token = hex (substr ($hash_buf, $offset, 8)); $token &= 0x7fffffff; $token %= 1000000; - #print "CODE: " . $token . "\n"; - #$tmp_hash = sprintf ("%s:%s", $hash_buf, $salt_buf); ## token must be leading zero padded, and salt leading zero stripped $tmp_hash = sprintf ("%06d:%d", $token, int($salt_buf)); } diff --git a/tools/test.sh b/tools/test.sh index d3468f86a..005acaafa 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -9,7 +9,7 @@ TDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # missing hash types: 5200,6251,6261,6271,6281 -HASH_TYPES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 125 130 131 132 133 140 141 150 160 200 300 400 500 600 900 1000 1100 1300 1400 1410 1411 1420 1430 1440 1441 1450 1460 1500 1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 3910 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5000 5100 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7701 7800 7801 7900 8000 8100 8200 8300 8400 8500 8600 8700 8900 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11900 12000 12001 12100 12200 12300 12400 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13800 13900 14000 14100 14400 14600 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16900 99999" +HASH_TYPES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 125 130 131 132 133 140 141 150 160 200 300 400 500 600 900 1000 1100 1300 1400 1410 1411 1420 1430 1440 1441 1450 1460 1500 
1600 1700 1710 1711 1720 1722 1730 1731 1740 1750 1760 1800 2100 2400 2410 2500 2600 2611 2612 2711 2811 3000 3100 3200 3710 3711 3800 3910 4010 4110 4300 4400 4500 4520 4521 4522 4700 4800 4900 5100 5300 5400 5500 5600 5700 5800 6000 6100 6211 6212 6213 6221 6222 6223 6231 6232 6233 6241 6242 6243 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7700 7701 7800 7801 7900 8000 8100 8200 8300 8400 8500 8600 8700 8900 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11900 12000 12001 12100 12200 12300 12400 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13800 13900 14000 14100 14400 14600 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16900 17300 17400 17500 17600 17700 17800 17900 18000 99999" #ATTACK_MODES="0 1 3 6 7" ATTACK_MODES="0 1 3 7"