From a71c69983d2b3043478410359ea96c1ae8bf0d95 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 21 Feb 2018 10:50:24 +0100 Subject: [PATCH] Make words_buf_r in DES bitsliced kernels __constant --- OpenCL/m01500_a3.cl | 100 ++++++++++++++------------------------------ OpenCL/m03000_a3.cl | 92 +++++++++++++--------------------------- OpenCL/m14000_a3.cl | 96 +++++++++++++----------------------------- 3 files changed, 90 insertions(+), 198 deletions(-) diff --git a/OpenCL/m01500_a3.cl b/OpenCL/m01500_a3.cl index 6b6940870..3c4dca3fe 100644 --- a/OpenCL/m01500_a3.cl +++ b/OpenCL/m01500_a3.cl @@ -1881,7 +1881,37 @@ DECLSPEC void transpose32c (u32 data[32]) swap (data[30], data[31], 1, 0x55555555); } -DECLSPEC void m01500m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +// +// transpose bitslice mod : attention race conditions, need different buffers for *in and *out +// + +__kernel void m01500_tm (__global u32 *mod, __global bs_word_t *words_buf_r) +{ + const u64 gid = get_global_id (0); + + const u32 block = gid / 32; + const u32 slice = gid % 32; + + const u32 w0 = mod[gid]; + + const u32 w0s = (w0 << 1) & 0xfefefefe; + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0, j = 0; i < 32; i += 8, j += 7) + { + atomic_or (&words_buf_r[block].b[j + 0], (((w0s >> (i + 7)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 1], (((w0s >> (i + 6)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 2], (((w0s >> (i + 5)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 3], (((w0s >> (i + 4)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 4], (((w0s >> (i + 3)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 5], (((w0s >> (i + 2)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 6], (((w0s >> (i + 1)) & 1) << slice)); + } +} + +__kernel void m01500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -2253,7 +2283,7 @@ DECLSPEC void m01500m (__global pw_t *pws, __global const kernel_rule_t *rules_b } } -DECLSPEC void m01500s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +__kernel void m01500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -2641,69 +2671,3 @@ DECLSPEC void m01500s (__global pw_t *pws, __global const kernel_rule_t *rules_b #include COMPARE_S } - -// -// transpose bitslice mod : attention race conditions, need different buffers for *in and *out -// - -__kernel void m01500_tm (__global u32 *mod, __global bs_word_t *words_buf_r) -{ - const u64 gid = get_global_id (0); - - const u32 block = gid / 32; - const u32 slice = gid % 32; - - const u32 w0 = mod[gid]; - - const u32 w0s = (w0 << 1) & 0xfefefefe; - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 0, j = 0; i < 32; i += 8, j += 7) - { - atomic_or (&words_buf_r[block].b[j + 0], (((w0s >> (i + 7)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 1], (((w0s >> (i + 6)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 2], (((w0s >> (i + 5)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 3], (((w0s >> (i + 4)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 4], (((w0s >> (i + 3)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 5], (((w0s >> (i + 2)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 6], (((w0s >> (i + 1)) & 1) << slice)); - } -} - -__kernel void m01500_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) -{ - /** - * base - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - if (gid >= gid_max) return; - - /** - * main - */ - - m01500m (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m01500_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) -{ - /** - * base - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - if (gid >= gid_max) return; - - /** - * main - */ - - m01500s (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} diff --git a/OpenCL/m03000_a3.cl b/OpenCL/m03000_a3.cl index 7ac27720b..c5d8419eb 100644 --- a/OpenCL/m03000_a3.cl +++ b/OpenCL/m03000_a3.cl @@ -1726,7 +1726,33 @@ DECLSPEC void transpose32c (u32 data[32]) swap (data[30], data[31], 1, 0x55555555); } -DECLSPEC void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +// +// transpose bitslice mod : attention race conditions, need different buffers for *in and *out +// + +__kernel void m03000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) +{ + const u64 gid = get_global_id (0); + + const u32 block = gid / 32; + const u32 slice = gid % 32; + + const u32 w0 = mod[gid]; + + for (int i = 0; i < 32; i += 8) + { + atomic_or (&words_buf_r[block].b[i + 0], (((w0 >> (i + 7)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[i + 1], (((w0 >> (i + 6)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[i + 2], (((w0 >> (i + 5)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[i + 3], (((w0 >> (i + 4)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[i + 4], (((w0 >> (i + 3)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[i + 5], (((w0 >> (i + 2)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[i + 6], (((w0 >> (i + 1)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[i + 7], (((w0 >> (i + 0)) & 1) << slice)); + } +} + +__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -2098,7 +2124,7 @@ DECLSPEC void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_b } } -DECLSPEC void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -2486,65 +2512,3 @@ DECLSPEC void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_b #include COMPARE_S } - -// -// transpose bitslice mod : attention race conditions, need different buffers for *in and *out -// - -__kernel void m03000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) -{ - const u64 gid = get_global_id (0); - - const u32 block = gid / 32; - const u32 slice = gid % 32; - - const u32 w0 = mod[gid]; - - for (int i = 0; i < 32; i += 8) - { - atomic_or (&words_buf_r[block].b[i + 0], (((w0 >> (i + 7)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[i + 1], (((w0 >> (i + 6)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[i + 2], (((w0 >> (i + 5)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[i + 3], (((w0 >> (i + 4)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[i + 4], (((w0 >> (i + 3)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[i + 5], (((w0 >> (i + 2)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[i + 6], (((w0 >> (i + 1)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[i + 7], (((w0 >> (i + 0)) & 1) << slice)); - } -} - -__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) -{ - /** - * base - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - if (gid >= gid_max) return; - - /** - * main - */ - - m03000m (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) -{ - /** - * base - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - if (gid >= gid_max) return; - - /** - * main - */ - - m03000s (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} diff --git a/OpenCL/m14000_a3.cl b/OpenCL/m14000_a3.cl index f7b59e283..cd8cbd497 100644 --- a/OpenCL/m14000_a3.cl +++ b/OpenCL/m14000_a3.cl @@ -1726,7 +1726,35 @@ DECLSPEC void transpose32c (u32 data[32]) swap (data[30], data[31], 1, 0x55555555); } -DECLSPEC void m14000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +// +// transpose bitslice mod : attention race conditions, need different buffers for *in and *out +// + +__kernel void m14000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) +{ + const u64 gid = get_global_id (0); + + const u32 block = gid / 32; + const u32 slice = gid % 32; + + const u32 w0 = mod[gid]; + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0, j = 0; i < 32; i += 8, j += 7) + { + atomic_or (&words_buf_r[block].b[j + 0], (((w0 >> (i + 7)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 1], (((w0 >> (i + 6)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 2], (((w0 >> (i + 5)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 3], (((w0 >> (i + 4)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 4], (((w0 >> (i + 3)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 5], (((w0 >> (i + 2)) & 1) << slice)); + atomic_or (&words_buf_r[block].b[j + 6], (((w0 >> (i + 1)) & 1) << slice)); + } +} + +__kernel void m14000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -2234,7 +2262,7 @@ DECLSPEC void m14000m (__global pw_t *pws, __global const kernel_rule_t *rules_b } } -DECLSPEC void m14000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +__kernel void m14000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) { /** * base @@ -2686,67 +2714,3 @@ DECLSPEC void m14000s (__global pw_t *pws, __global const kernel_rule_t *rules_b #include COMPARE_S } - -// -// transpose bitslice mod : attention race conditions, need different buffers for *in and *out -// - -__kernel void m14000_tm (__global u32 *mod, __global bs_word_t *words_buf_r) -{ - const u64 gid = get_global_id (0); - - const u32 block = gid / 32; - const u32 slice = gid % 32; - - const u32 w0 = mod[gid]; - - #ifdef _unroll - #pragma unroll - #endif - for (int i = 0, j = 0; i < 32; i += 8, j += 7) - { - atomic_or (&words_buf_r[block].b[j + 0], (((w0 >> (i + 7)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 1], (((w0 >> (i + 6)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 2], (((w0 >> (i + 5)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 3], (((w0 >> (i + 4)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 4], (((w0 >> (i + 3)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 5], (((w0 >> (i + 2)) & 1) << slice)); - atomic_or (&words_buf_r[block].b[j + 6], (((w0 >> (i + 1)) & 1) << slice)); - } -} - -__kernel void m14000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) -{ - /** - * base - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - if (gid >= gid_max) return; - - /** - * main - */ - - m14000m (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -} - -__kernel void m14000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bs_word_t *words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max) -{ - /** - * base - */ - - const u64 gid = get_global_id (0); - const u64 lid = get_local_id (0); - - if (gid >= gid_max) return; - - /** - * main - */ - - m14000s (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); -}