diff --git a/OpenCL/m01500_a0.cl b/OpenCL/m01500_a0.cl index 9f061b996..7bc358c18 100644 --- a/OpenCL/m01500_a0.cl +++ b/OpenCL/m01500_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -348,6 +349,8 @@ __constant u32 c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif static void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 (*s_skb)[64]) diff --git a/OpenCL/m02610_a0.cl b/OpenCL/m02610_a0.cl index 37a435532..f8b3572c0 100644 --- a/OpenCL/m02610_a0.cl +++ b/OpenCL/m02610_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], 
l_bin2asc[(i).sf]) #endif __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m02610_a3.cl b/OpenCL/m02610_a3.cl index 9bd0f751c..0fdd37f68 100644 --- a/OpenCL/m02610_a3.cl +++ b/OpenCL/m02610_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m02610m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t 
*rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) diff --git a/OpenCL/m02710_a0.cl b/OpenCL/m02710_a0.cl index a0002bbe6..aefd4991a 100644 --- a/OpenCL/m02710_a0.cl +++ b/OpenCL/m02710_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 
*bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m02710_a3.cl b/OpenCL/m02710_a3.cl index 3518f17db..d943ba2ee 100644 --- a/OpenCL/m02710_a3.cl +++ b/OpenCL/m02710_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m02710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 
*bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) diff --git a/OpenCL/m02810_a0.cl b/OpenCL/m02810_a0.cl index c5abe34e5..42b49d5e8 100644 --- a/OpenCL/m02810_a0.cl +++ b/OpenCL/m02810_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t 
*digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m02810_a3.cl b/OpenCL/m02810_a3.cl index 43f326140..467be23dc 100644 --- a/OpenCL/m02810_a3.cl +++ b/OpenCL/m02810_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m02810m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, 
__global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) diff --git a/OpenCL/m03000_a0.cl b/OpenCL/m03000_a0.cl index b441d5879..88fcbd12c 100644 --- a/OpenCL/m03000_a0.cl +++ b/OpenCL/m03000_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -351,6 +352,8 @@ __constant u32 c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) diff --git a/OpenCL/m03100_a0.cl b/OpenCL/m03100_a0.cl index cdcdf09e5..fc5bea937 100644 --- a/OpenCL/m03100_a0.cl +++ b/OpenCL/m03100_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -366,6 +367,8 @@ __constant u32 c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], 
(S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) diff --git a/OpenCL/m03100_a3.cl b/OpenCL/m03100_a3.cl index 01784d284..0190c295a 100644 --- a/OpenCL/m03100_a3.cl +++ b/OpenCL/m03100_a3.cl @@ -1,6 +1,7 @@ /** / s_skb * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -364,6 +365,8 @@ __constant u32 c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) diff --git a/OpenCL/m03710_a0.cl b/OpenCL/m03710_a0.cl index 0dbf39f41..2d9be8516 100644 --- a/OpenCL/m03710_a0.cl +++ b/OpenCL/m03710_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], 
l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m03710_a3.cl b/OpenCL/m03710_a3.cl index b6ddf278d..17ebf2f2a 100644 --- a/OpenCL/m03710_a3.cl +++ b/OpenCL/m03710_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], 
l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m03710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) diff --git a/OpenCL/m04310_a0.cl b/OpenCL/m04310_a0.cl index b1fdad9cf..be92f86fd 100644 --- a/OpenCL/m04310_a0.cl +++ b/OpenCL/m04310_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], 
l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m04310_a3.cl b/OpenCL/m04310_a3.cl index 4de070349..0ef9d7e1c 100644 --- a/OpenCL/m04310_a3.cl +++ b/OpenCL/m04310_a3.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -28,6 +30,8 @@ #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], 
l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m04310m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) diff --git a/OpenCL/m04400_a0.cl b/OpenCL/m04400_a0.cl index 1d5ddfdbd..7764ac4b3 100644 --- a/OpenCL/m04400_a0.cl +++ b/OpenCL/m04400_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m04400_m04 (__global pw_t *pws, __global 
kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m04400_a3.cl b/OpenCL/m04400_a3.cl index aef6da76c..94d00fc0a 100644 --- a/OpenCL/m04400_a3.cl +++ b/OpenCL/m04400_a3.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -28,6 +30,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m04400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global 
void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) diff --git a/OpenCL/m04500_a0.cl b/OpenCL/m04500_a0.cl index 9e679dd19..3168959fe 100644 --- a/OpenCL/m04500_a0.cl +++ b/OpenCL/m04500_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 
*bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m04500_a3.cl b/OpenCL/m04500_a3.cl index 28fd960ac..65f2bf900 100644 --- a/OpenCL/m04500_a3.cl +++ b/OpenCL/m04500_a3.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -28,6 +30,8 @@ #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m04500m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 
*bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) diff --git a/OpenCL/m04700_a0.cl b/OpenCL/m04700_a0.cl index 37c733fd1..7c7c54301 100644 --- a/OpenCL/m04700_a0.cl +++ b/OpenCL/m04700_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -31,6 +33,8 @@ #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, 
__global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m04700_a3.cl b/OpenCL/m04700_a3.cl index da2118277..3b5358a60 100644 --- a/OpenCL/m04700_a3.cl +++ b/OpenCL/m04700_a3.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -29,6 +31,8 @@ #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m04700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void 
*esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) diff --git a/OpenCL/m05500_a0.cl b/OpenCL/m05500_a0.cl index 798941a01..92580eafc 100644 --- a/OpenCL/m05500_a0.cl +++ b/OpenCL/m05500_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -348,6 +349,8 @@ __constant u32 c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) diff --git a/OpenCL/m05500_a3.cl b/OpenCL/m05500_a3.cl index d0036550f..937ef6749 100644 --- a/OpenCL/m05500_a3.cl +++ b/OpenCL/m05500_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -346,6 +347,8 @@ __constant u32 c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], 
(S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) diff --git a/OpenCL/m06100_a0.cl b/OpenCL/m06100_a0.cl index 9856498db..31753da5d 100644 --- a/OpenCL/m06100_a0.cl +++ b/OpenCL/m06100_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -34,6 +35,8 @@ #define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif __constant u32 Ch[8][256] = diff --git a/OpenCL/m06100_a3.cl b/OpenCL/m06100_a3.cl index 2e40381b1..6e5399a8f 100644 --- a/OpenCL/m06100_a3.cl +++ b/OpenCL/m06100_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -32,6 +33,8 @@ #define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], 
(S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif __constant u32 Ch[8][256] = diff --git a/OpenCL/m06900_a0.cl b/OpenCL/m06900_a0.cl index f313b80fc..b6a707d5e 100644 --- a/OpenCL/m06900_a0.cl +++ b/OpenCL/m06900_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -300,6 +301,8 @@ __constant u32 c_tables[4][256] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #define _round(k1,k2,tbl) \ diff --git a/OpenCL/m06900_a3.cl b/OpenCL/m06900_a3.cl index 328259245..5d1fef8b2 100644 --- a/OpenCL/m06900_a3.cl +++ b/OpenCL/m06900_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -298,6 +299,8 @@ __constant u32 c_tables[4][256] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], 
(S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #define _round(k1,k2,tbl) \ diff --git a/OpenCL/m07600_a0.cl b/OpenCL/m07600_a0.cl index 4ac6d1af8..db4e3be2b 100644 --- a/OpenCL/m07600_a0.cl +++ b/OpenCL/m07600_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const 
u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m07600_a3.cl b/OpenCL/m07600_a3.cl index f9b793f29..01a461de1 100644 --- a/OpenCL/m07600_a3.cl +++ b/OpenCL/m07600_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) diff --git a/OpenCL/m08400_a0.cl b/OpenCL/m08400_a0.cl index 553c925a3..fd51ed0b0 
100644 --- a/OpenCL/m08400_a0.cl +++ b/OpenCL/m08400_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) diff --git a/OpenCL/m08400_a3.cl b/OpenCL/m08400_a3.cl index 6b91abec4..a1cae8cde 100644 --- a/OpenCL/m08400_a3.cl +++ b/OpenCL/m08400_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void 
sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) diff --git a/OpenCL/m08500_a0.cl b/OpenCL/m08500_a0.cl index 71bdc2805..3e4ea5170 100644 --- a/OpenCL/m08500_a0.cl +++ b/OpenCL/m08500_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -388,6 +389,8 @@ __constant u32 c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #if VECT_SIZE == 1 @@ -398,6 +401,8 @@ __constant u32 c_skb[8][64] = #define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) #elif VECT_SIZE == 8 #define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#elif VECT_SIZE == 16 +#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf]) #endif static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) diff --git a/OpenCL/m08500_a3.cl b/OpenCL/m08500_a3.cl index 2e9b9c5be..d9ad1a468 100644 --- a/OpenCL/m08500_a3.cl +++ b/OpenCL/m08500_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -386,6 +387,8 @@ __constant u32 
c_skb[8][64] = #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #if VECT_SIZE == 1 @@ -396,6 +399,8 @@ __constant u32 c_skb[8][64] = #define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) #elif VECT_SIZE == 8 #define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#elif VECT_SIZE == 16 +#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf]) #endif static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 (*s_SPtrans)[64]) diff --git a/OpenCL/m08600_a0.cl b/OpenCL/m08600_a0.cl index df78e9dca..21c59ed91 100644 --- a/OpenCL/m08600_a0.cl +++ b/OpenCL/m08600_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -68,6 +69,8 @@ __constant u32 lotus_magic_table[256] = #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) #elif VECT_SIZE == 8 #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#elif VECT_SIZE == 16 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], 
(S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf]) #endif static void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) diff --git a/OpenCL/m08600_a3.cl b/OpenCL/m08600_a3.cl index 6fe384556..d4c66032f 100644 --- a/OpenCL/m08600_a3.cl +++ b/OpenCL/m08600_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -66,6 +67,8 @@ __constant u32 lotus_magic_table[256] = #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) #elif VECT_SIZE == 8 #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#elif VECT_SIZE == 16 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf]) #endif static void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) diff --git a/OpenCL/m08700_a0.cl b/OpenCL/m08700_a0.cl index c49b34af3..98c059319 100644 --- a/OpenCL/m08700_a0.cl +++ b/OpenCL/m08700_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -68,6 +69,8 @@ __constant u32 lotus_magic_table[256] = #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], 
l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif #if VECT_SIZE == 1 @@ -78,6 +81,8 @@ __constant u32 lotus_magic_table[256] = #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) #elif VECT_SIZE == 8 #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#elif VECT_SIZE == 16 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf]) #endif static void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) diff --git a/OpenCL/m08700_a3.cl b/OpenCL/m08700_a3.cl index ef96a74bc..722e5ab89 100644 --- a/OpenCL/m08700_a3.cl +++ b/OpenCL/m08700_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -68,6 +69,8 @@ __constant u32 lotus_magic_table[256] = #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif #if VECT_SIZE == 1 @@ -78,6 +81,8 @@ __constant u32 lotus_magic_table[256] = #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) #elif VECT_SIZE == 8 #define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], 
(S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#elif VECT_SIZE == 16 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7], (S)[(i).s8], (S)[(i).s9], (S)[(i).sa], (S)[(i).sb], (S)[(i).sc], (S)[(i).sd], (S)[(i).se], (S)[(i).sf]) #endif static void lotus_mix (u32x *in, __local u32 *s_lotus_magic_table) diff --git a/OpenCL/m11100_a0.cl b/OpenCL/m11100_a0.cl index e4c38c942..2dc585eff 100644 --- a/OpenCL/m11100_a0.cl +++ b/OpenCL/m11100_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 
bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m11100_a3.cl b/OpenCL/m11100_a3.cl index 03ffd42cb..4a27ef151 100644 --- a/OpenCL/m11100_a3.cl +++ b/OpenCL/m11100_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, 
const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) diff --git a/OpenCL/m11400_a0.cl b/OpenCL/m11400_a0.cl index 1fbc8987d..9b3ffa3d8 100644 --- a/OpenCL/m11400_a0.cl +++ b/OpenCL/m11400_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) diff --git a/OpenCL/m11400_a3.cl b/OpenCL/m11400_a3.cl index 322f73022..87d9511ee 100644 --- a/OpenCL/m11400_a3.cl +++ b/OpenCL/m11400_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], 
l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) diff --git a/OpenCL/m11500_a0.cl b/OpenCL/m11500_a0.cl index 9b6d1bd40..12f86528b 100644 --- a/OpenCL/m11500_a0.cl +++ b/OpenCL/m11500_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -104,6 +106,8 @@ static u32x round_crc32 (u32x a, const u32x v) a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]); #elif VECT_SIZE == 8 a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]); + #elif VECT_SIZE == 16 + a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]); #endif a ^= s; diff --git a/OpenCL/m11500_a3.cl b/OpenCL/m11500_a3.cl index 166dd2413..347ca01c8 100644 --- a/OpenCL/m11500_a3.cl +++ b/OpenCL/m11500_a3.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -102,6 +104,8 @@ static u32x round_crc32 (u32x a, const u32x v) a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]); #elif VECT_SIZE == 8 a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]); + #elif VECT_SIZE == 16 + a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], 
crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7], crc32tab[k.s8], crc32tab[k.s9], crc32tab[k.sa], crc32tab[k.sb], crc32tab[k.sc], crc32tab[k.sd], crc32tab[k.se], crc32tab[k.sf]); #endif a ^= s; diff --git a/OpenCL/m11700_a0.cl b/OpenCL/m11700_a0.cl index a495115ef..555895bd2 100644 --- a/OpenCL/m11700_a0.cl +++ b/OpenCL/m11700_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -34,6 +35,8 @@ #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #define SBOG_LPSti64 \ diff --git a/OpenCL/m11700_a3.cl b/OpenCL/m11700_a3.cl index 88413f2a8..dc50cc26e 100644 --- a/OpenCL/m11700_a3.cl +++ b/OpenCL/m11700_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -32,6 +33,8 @@ #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], 
(S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #define SBOG_LPSti64 \ diff --git a/OpenCL/m11800_a0.cl b/OpenCL/m11800_a0.cl index 8d9c8ab25..e7a681ae9 100644 --- a/OpenCL/m11800_a0.cl +++ b/OpenCL/m11800_a0.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -34,6 +35,8 @@ #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #define SBOG_LPSti64 \ diff --git a/OpenCL/m11800_a3.cl b/OpenCL/m11800_a3.cl index e61090b42..0f61a7224 100644 --- a/OpenCL/m11800_a3.cl +++ b/OpenCL/m11800_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -32,6 +33,8 @@ #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) #elif VECT_SIZE == 8 #define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#elif VECT_SIZE == 16 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7], (S)[(n)][(i).s8], (S)[(n)][(i).s9], (S)[(n)][(i).sa], (S)[(n)][(i).sb], (S)[(n)][(i).sc], (S)[(n)][(i).sd], (S)[(n)][(i).se], (S)[(n)][(i).sf]) #endif #define SBOG_LPSti64 \ diff --git a/OpenCL/m12600_a0.cl b/OpenCL/m12600_a0.cl index 2d16d83e9..948d3a2b4 
100644 --- a/OpenCL/m12600_a0.cl +++ b/OpenCL/m12600_a0.cl @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -30,6 +32,8 @@ #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 rules_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) diff --git a/OpenCL/m12600_a3.cl b/OpenCL/m12600_a3.cl index 8501bd538..fdac3adc6 100644 --- a/OpenCL/m12600_a3.cl +++ b/OpenCL/m12600_a3.cl @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * 
License.....: MIT */ @@ -30,6 +31,8 @@ #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) #elif VECT_SIZE == 8 #define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#elif VECT_SIZE == 16 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf]) #endif static void m12600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 *l_bin2asc) diff --git a/OpenCL/rp.c b/OpenCL/rp.c index b495d15dc..6dda7c020 100644 --- a/OpenCL/rp.c +++ b/OpenCL/rp.c @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -4238,6 +4240,96 @@ u32 apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_l w1[3].s7 = 
tmp1[3]; break; #endif + + #if VECT_SIZE >= 16 + case 8: + w0[0].s8 = tmp0[0]; + w0[1].s8 = tmp0[1]; + w0[2].s8 = tmp0[2]; + w0[3].s8 = tmp0[3]; + w1[0].s8 = tmp1[0]; + w1[1].s8 = tmp1[1]; + w1[2].s8 = tmp1[2]; + w1[3].s8 = tmp1[3]; + break; + + case 9: + w0[0].s9 = tmp0[0]; + w0[1].s9 = tmp0[1]; + w0[2].s9 = tmp0[2]; + w0[3].s9 = tmp0[3]; + w1[0].s9 = tmp1[0]; + w1[1].s9 = tmp1[1]; + w1[2].s9 = tmp1[2]; + w1[3].s9 = tmp1[3]; + break; + + case 10: + w0[0].sa = tmp0[0]; + w0[1].sa = tmp0[1]; + w0[2].sa = tmp0[2]; + w0[3].sa = tmp0[3]; + w1[0].sa = tmp1[0]; + w1[1].sa = tmp1[1]; + w1[2].sa = tmp1[2]; + w1[3].sa = tmp1[3]; + break; + + case 11: + w0[0].sb = tmp0[0]; + w0[1].sb = tmp0[1]; + w0[2].sb = tmp0[2]; + w0[3].sb = tmp0[3]; + w1[0].sb = tmp1[0]; + w1[1].sb = tmp1[1]; + w1[2].sb = tmp1[2]; + w1[3].sb = tmp1[3]; + break; + + case 12: + w0[0].sc = tmp0[0]; + w0[1].sc = tmp0[1]; + w0[2].sc = tmp0[2]; + w0[3].sc = tmp0[3]; + w1[0].sc = tmp1[0]; + w1[1].sc = tmp1[1]; + w1[2].sc = tmp1[2]; + w1[3].sc = tmp1[3]; + break; + + case 13: + w0[0].sd = tmp0[0]; + w0[1].sd = tmp0[1]; + w0[2].sd = tmp0[2]; + w0[3].sd = tmp0[3]; + w1[0].sd = tmp1[0]; + w1[1].sd = tmp1[1]; + w1[2].sd = tmp1[2]; + w1[3].sd = tmp1[3]; + break; + + case 14: + w0[0].se = tmp0[0]; + w0[1].se = tmp0[1]; + w0[2].se = tmp0[2]; + w0[3].se = tmp0[3]; + w1[0].se = tmp1[0]; + w1[1].se = tmp1[1]; + w1[2].se = tmp1[2]; + w1[3].se = tmp1[3]; + break; + + case 15: + w0[0].sf = tmp0[0]; + w0[1].sf = tmp0[1]; + w0[2].sf = tmp0[2]; + w0[3].sf = tmp0[3]; + w1[0].sf = tmp1[0]; + w1[1].sf = tmp1[1]; + w1[2].sf = tmp1[2]; + w1[3].sf = tmp1[3]; + break; + #endif } } diff --git a/OpenCL/simd.c b/OpenCL/simd.c index fcab5e3ac..b520aefd1 100644 --- a/OpenCL/simd.c +++ b/OpenCL/simd.c @@ -1,3 +1,9 @@ +/** + * Authors.....: Jens Steube + * magnum + * + * License.....: MIT + */ // vliw1 @@ -586,6 +592,580 @@ #endif +// vliw16 + +#if VECT_SIZE == 16 + +#define MATCHES_ONE_VV(a,b) (((a).s0 == (b).s0) || ((a).s1 == (b).s1) || 
((a).s2 == (b).s2) || ((a).s3 == (b).s3) || ((a).s4 == (b).s4) || ((a).s5 == (b).s5) || ((a).s6 == (b).s6) || ((a).s7 == (b).s7) || ((a).s8 == (b).s8) || ((a).s9 == (b).s9) || ((a).sa == (b).sa) || ((a).sb == (b).sb) || ((a).sc == (b).sc) || ((a).sd == (b).sd) || ((a).se == (b).se) || ((a).sf == (b).sf)) +#define MATCHES_ONE_VS(a,b) (((a).s0 == (b) ) || ((a).s1 == (b) ) || ((a).s2 == (b) ) || ((a).s3 == (b) ) || ((a).s4 == (b) ) || ((a).s5 == (b) ) || ((a).s6 == (b) ) || ((a).s7 == (b) ) || ((a).s8 == (b) ) || ((a).s9 == (b) ) || ((a).sa == (b) ) || ((a).sb == (b) ) || ((a).sc == (b) ) || ((a).sd == (b) ) || ((a).se == (b) ) || ((a).sf == (b) )) + +#define COMPARE_S_SIMD(h0,h1,h2,h3) \ +{ \ + if (((h0).s0 == search[0]) && ((h1).s0 == search[1]) && ((h2).s0 == search[2]) && ((h3).s0 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s1 == search[0]) && ((h1).s1 == search[1]) && ((h2).s1 == search[2]) && ((h3).s1 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s2 == search[0]) && ((h1).s2 == search[1]) && ((h2).s2 == search[2]) && ((h3).s2 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 2); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s3 == search[0]) && ((h1).s3 == search[1]) && ((h2).s3 == search[2]) && ((h3).s3 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) 
== 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 3); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + if (((h0).s4 == search[0]) && ((h1).s4 == search[1]) && ((h2).s4 == search[2]) && ((h3).s4 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 4); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s5 == search[0]) && ((h1).s5 == search[1]) && ((h2).s5 == search[2]) && ((h3).s5 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 5); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s6 == search[0]) && ((h1).s6 == search[1]) && ((h2).s6 == search[2]) && ((h3).s6 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 6); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s7 == search[0]) && ((h1).s7 == search[1]) && ((h2).s7 == search[2]) && ((h3).s7 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 7); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s8 == search[0]) && ((h1).s8 == search[1]) && ((h2).s8 == search[2]) && ((h3).s8 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 8); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).s9 == search[0]) && ((h1).s9 == search[1]) && ((h2).s9 == 
search[2]) && ((h3).s9 == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 9); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).sa == search[0]) && ((h1).sa == search[1]) && ((h2).sa == search[2]) && ((h3).sa == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 10); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).sb == search[0]) && ((h1).sb == search[1]) && ((h2).sb == search[2]) && ((h3).sb == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 11); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).sc == search[0]) && ((h1).sc == search[1]) && ((h2).sc == search[2]) && ((h3).sc == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 12); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).sd == search[0]) && ((h1).sd == search[1]) && ((h2).sd == search[2]) && ((h3).sd == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 13); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).se == search[0]) && ((h1).se == search[1]) && ((h2).se == search[2]) && ((h3).se == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, 
hashes_shown, final_hash_pos, gid, il_pos + 14); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + \ + if (((h0).sf == search[0]) && ((h1).sf == search[1]) && ((h2).sf == search[2]) && ((h3).sf == search[3])) \ + { \ + const u32 final_hash_pos = digests_offset + 0; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 15); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ +} + +#define COMPARE_M_SIMD(h0,h1,h2,h3) \ +{ \ + const u32 digest_tp0[4] = { h0.s0, h1.s0, h2.s0, h3.s0 }; \ + const u32 digest_tp1[4] = { h0.s1, h1.s1, h2.s1, h3.s1 }; \ + const u32 digest_tp2[4] = { h0.s2, h1.s2, h2.s2, h3.s2 }; \ + const u32 digest_tp3[4] = { h0.s3, h1.s3, h2.s3, h3.s3 }; \ + const u32 digest_tp4[4] = { h0.s4, h1.s4, h2.s4, h3.s4 }; \ + const u32 digest_tp5[4] = { h0.s5, h1.s5, h2.s5, h3.s5 }; \ + const u32 digest_tp6[4] = { h0.s6, h1.s6, h2.s6, h3.s6 }; \ + const u32 digest_tp7[4] = { h0.s7, h1.s7, h2.s7, h3.s7 }; \ + const u32 digest_tp8[4] = { h0.s8, h1.s8, h2.s8, h3.s8 }; \ + const u32 digest_tp9[4] = { h0.s9, h1.s9, h2.s9, h3.s9 }; \ + const u32 digest_tp10[4] = { h0.sa, h1.sa, h2.sa, h3.sa }; \ + const u32 digest_tp11[4] = { h0.sb, h1.sb, h2.sb, h3.sb }; \ + const u32 digest_tp12[4] = { h0.sc, h1.sc, h2.sc, h3.sc }; \ + const u32 digest_tp13[4] = { h0.sd, h1.sd, h2.sd, h3.sd }; \ + const u32 digest_tp14[4] = { h0.se, h1.se, h2.se, h3.se }; \ + const u32 digest_tp15[4] = { h0.sf, h1.sf, h2.sf, h3.sf }; \ + \ + if (check (digest_tp0, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp0, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + 
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp1, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp1, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp2, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp2, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 2); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp3, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp3, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 3); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + if (check (digest_tp4, \ 
+ bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp4, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 4); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp5, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp5, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 5); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp6, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp6, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 6); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp7, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, 
bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp7, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 7); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp8, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp8, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 8); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp9, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp9, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 9); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp10, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp10, digests_cnt, 
&digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 10); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp11, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp11, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 11); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp12, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp12, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 12); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp13, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp13, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if 
(atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 13); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp14, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp14, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 14); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ + \ + if (check (digest_tp15, \ + bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \ + bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \ + bitmap_mask, \ + bitmap_shift1, \ + bitmap_shift2)) \ + { \ + int hash_pos = find_hash (digest_tp15, digests_cnt, &digests_buf[digests_offset]); \ + \ + if (hash_pos != -1) \ + { \ + const u32 final_hash_pos = digests_offset + hash_pos; \ + \ + if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \ + { \ + mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 15); \ + \ + d_return_buf[lid] = 1; \ + } \ + } \ + } \ +} + +#endif + #define MATCHES_NONE_VV(a,b) !(MATCHES_ONE_VV ((a), (b))) #define MATCHES_NONE_VS(a,b) !(MATCHES_ONE_VS ((a), (b))) @@ -601,6 +1181,8 @@ static inline u32x w0r_create_bft (__global bf_t *bfs_buf, const u32 il_pos) const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i); #elif VECT_SIZE == 8 const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 
7].i); + #elif VECT_SIZE == 16 + const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i, bfs_buf[il_pos + 8].i, bfs_buf[il_pos + 9].i, bfs_buf[il_pos + 10].i, bfs_buf[il_pos + 11].i, bfs_buf[il_pos + 12].i, bfs_buf[il_pos + 13].i, bfs_buf[il_pos + 14].i, bfs_buf[il_pos + 15].i); #endif return w0r; @@ -614,6 +1196,8 @@ static inline u32x w0r_create_bft (__global bf_t *bfs_buf, const u32 il_pos) #define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 4) + 0].var[(idx)], (arr)[((gid) * 4) + 1].var[(idx)], (arr)[((gid) * 4) + 2].var[(idx)], (arr)[((gid) * 4) + 3].var[(idx)]) #elif VECT_SIZE == 8 #define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)]) +#elif VECT_SIZE == 16 +#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)]) /* gathers 16 consecutive elements per gid; stride equals VECT_SIZE, as in the 4- and 8-wide variants */ #endif #if VECT_SIZE == 1 @@ -624,5 +1208,6 @@ static inline u32x w0r_create_bft (__global bf_t *bfs_buf, const u32 il_pos) #define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 4) + 0].var[(idx)] = val.s0; (arr)[((gid) * 4) + 1].var[(idx)] = val.s1; (arr)[((gid) * 4) + 2].var[(idx)] = val.s2;
(arr)[((gid) * 4) + 3].var[(idx)] = val.s3; #elif VECT_SIZE == 8 #define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 8) + 0].var[(idx)] = val.s0; (arr)[((gid) * 8) + 1].var[(idx)] = val.s1; (arr)[((gid) * 8) + 2].var[(idx)] = val.s2; (arr)[((gid) * 8) + 3].var[(idx)] = val.s3; (arr)[((gid) * 8) + 4].var[(idx)] = val.s4; (arr)[((gid) * 8) + 5].var[(idx)] = val.s5; (arr)[((gid) * 8) + 6].var[(idx)] = val.s6; (arr)[((gid) * 8) + 7].var[(idx)] = val.s7; +#elif VECT_SIZE == 16 +#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 16) + 0].var[(idx)] = val.s0; (arr)[((gid) * 16) + 1].var[(idx)] = val.s1; (arr)[((gid) * 16) + 2].var[(idx)] = val.s2; (arr)[((gid) * 16) + 3].var[(idx)] = val.s3; (arr)[((gid) * 16) + 4].var[(idx)] = val.s4; (arr)[((gid) * 16) + 5].var[(idx)] = val.s5; (arr)[((gid) * 16) + 6].var[(idx)] = val.s6; (arr)[((gid) * 16) + 7].var[(idx)] = val.s7; (arr)[((gid) * 16) + 8].var[(idx)] = val.s8; (arr)[((gid) * 16) + 9].var[(idx)] = val.s9; (arr)[((gid) * 16) + 10].var[(idx)] = val.sa; (arr)[((gid) * 16) + 11].var[(idx)] = val.sb; (arr)[((gid) * 16) + 12].var[(idx)] = val.sc; (arr)[((gid) * 16) + 13].var[(idx)] = val.sd; (arr)[((gid) * 16) + 14].var[(idx)] = val.se; (arr)[((gid) * 16) + 15].var[(idx)] = val.sf; /* scatters 16 lanes to consecutive elements per gid; stride equals VECT_SIZE, as in the 4- and 8-wide variants */ #endif - diff --git a/OpenCL/types_ocl.c b/OpenCL/types_ocl.c index c735ac516..29c728b71 100644 --- a/OpenCL/types_ocl.c +++ b/OpenCL/types_ocl.c @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ @@ -16,32 +18,19 @@ typedef ulong u64; #define VECT_SIZE 1 #endif +#define CONCAT(a, b) a##b +#define VTYPE(type, width) CONCAT(type, width) + #if VECT_SIZE == 1 typedef uchar u8x; typedef ushort u16x; typedef uint u32x; typedef ulong u64x; -#endif - -#if VECT_SIZE == 2 -typedef uchar2 u8x; -typedef ushort2 u16x; -typedef uint2 u32x; -typedef ulong2 u64x; -#endif - -#if VECT_SIZE == 4 -typedef uchar4 u8x; -typedef ushort4 u16x; -typedef uint4 u32x; -typedef ulong4 u64x; -#endif - -#if VECT_SIZE == 8
-typedef uchar8 u8x; -typedef ushort8 u16x; -typedef uint8 u32x; -typedef ulong8 u64x; +#else +typedef VTYPE(uchar, VECT_SIZE) u8x; +typedef VTYPE(ushort, VECT_SIZE) u16x; +typedef VTYPE(uint, VECT_SIZE) u32x; +typedef VTYPE(ulong, VECT_SIZE) u64x; #endif // this one needs to die @@ -93,6 +82,17 @@ static inline u32x l32_from_64 (u64x a) r.s7 = (u32) a.s7; #endif + #if VECT_SIZE >= 16 + r.s8 = (u32) a.s8; + r.s9 = (u32) a.s9; + r.sa = (u32) a.sa; + r.sb = (u32) a.sb; + r.sc = (u32) a.sc; + r.sd = (u32) a.sd; + r.se = (u32) a.se; + r.sf = (u32) a.sf; + #endif + return r; } @@ -123,6 +123,17 @@ static inline u32x h32_from_64 (u64x a) r.s7 = (u32) a.s7; #endif + #if VECT_SIZE >= 16 + r.s8 = (u32) a.s8; + r.s9 = (u32) a.s9; + r.sa = (u32) a.sa; + r.sb = (u32) a.sb; + r.sc = (u32) a.sc; + r.sd = (u32) a.sd; + r.se = (u32) a.se; + r.sf = (u32) a.sf; + #endif + return r; } @@ -151,6 +162,17 @@ static inline u64x hl32_to_64 (const u32x a, const u32x b) r.s7 = as_ulong ((uint2) (b.s7, a.s7)); #endif + #if VECT_SIZE >= 16 + r.s8 = as_ulong ((uint2) (b.s8, a.s8)); + r.s9 = as_ulong ((uint2) (b.s9, a.s9)); + r.sa = as_ulong ((uint2) (b.sa, a.sa)); + r.sb = as_ulong ((uint2) (b.sb, a.sb)); + r.sc = as_ulong ((uint2) (b.sc, a.sc)); + r.sd = as_ulong ((uint2) (b.sd, a.sd)); + r.se = as_ulong ((uint2) (b.se, a.se)); + r.sf = as_ulong ((uint2) (b.sf, a.sf)); + #endif + return r; } @@ -629,6 +651,146 @@ static inline u64x rotr64 (const u64x a, const u32 n) #endif + #if VECT_SIZE >= 16 + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s8)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s8) : "r"(tl), 
"r"(tr)); + } + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s9)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s9) : "r"(tl), "r"(tr)); + } + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sa)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sa) : "r"(tl), "r"(tr)); + } + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sb)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sb) : "r"(tl), "r"(tr)); + } + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sc)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + 
asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sc) : "r"(tl), "r"(tr)); + } + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sd)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sd) : "r"(tl), "r"(tr)); + } + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.se)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r.se) : "r"(tl), "r"(tr)); + } + + { + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sf)); + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sf) : "r"(tl), "r"(tr)); + } + + #endif + return r; } #else @@ -668,6 +830,17 @@ static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s8) : "r"(a.s8), "r"(b.s8), "r"(c.s8)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s9) : "r"(a.s9), "r"(b.s9), 
"r"(c.s9)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sa) : "r"(a.sa), "r"(b.sa), "r"(c.sa)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sb) : "r"(a.sb), "r"(b.sb), "r"(c.sb)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sc) : "r"(a.sc), "r"(b.sc), "r"(c.sc)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sd) : "r"(a.sd), "r"(b.sd), "r"(c.sd)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.se) : "r"(a.se), "r"(b.se), "r"(c.se)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.sf) : "r"(a.sf), "r"(b.sf), "r"(c.sf)); + #endif + return r; } @@ -722,6 +895,17 @@ static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8)); + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9)); + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa)); + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb)); + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc)); + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd)); + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se)); + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf)); + #endif + return r; } @@ -733,29 +917,34 @@ static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); #endif - #if VECT_SIZE == 4 - asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : 
"r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); + #if VECT_SIZE >= 4 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); #endif - #if VECT_SIZE == 8 - asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); - asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); - asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); + #if VECT_SIZE >= 8 asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4)); asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5)); asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6)); asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8)); + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9)); + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa)); + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb)); + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc)); + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd)); + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se)); + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf)); + #endif + return r; } @@ -767,29 +956,34 @@ static inline u32x lut3_59 (const u32x a, const u32x b, const 
u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); #endif - #if VECT_SIZE == 4 - asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); + #if VECT_SIZE >= 4 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); #endif - #if VECT_SIZE == 8 - asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); - asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); - asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); + #if VECT_SIZE >= 8 asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4)); asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5)); asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6)); asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8)); + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9)); + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa)); + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb)); + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sc) : "r" (a.sc), "r" 
(b.sc), "r" (c.sc)); + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd)); + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se)); + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf)); + #endif + return r; } @@ -801,29 +995,34 @@ static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); #endif - #if VECT_SIZE == 4 - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); + #if VECT_SIZE >= 4 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); #endif - #if VECT_SIZE == 8 - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); + #if VECT_SIZE >= 8 asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4)); asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5)); asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6)); asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : 
"=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8)); + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9)); + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa)); + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb)); + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc)); + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd)); + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se)); + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf)); + #endif + return r; } @@ -835,29 +1034,34 @@ static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); #endif - #if VECT_SIZE == 4 - asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); + #if VECT_SIZE >= 4 asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); #endif - #if VECT_SIZE == 8 - asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); + #if VECT_SIZE >= 8 asm ("lop3.b32 %0, %1, %2, %3, 
0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4)); asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5)); asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6)); asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf)); + #endif + return r; } @@ -869,29 +1073,34 @@ static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); #endif - #if VECT_SIZE == 4 - asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); + #if VECT_SIZE >= 4 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); #endif - #if VECT_SIZE == 8 - asm ("lop3.b32 %0, 
%1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); - asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); + #if VECT_SIZE >= 8 asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4)); asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5)); asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6)); asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se)); + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf)); + #endif + return r; } @@ -903,29 +1112,34 @@ static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); #endif - #if VECT_SIZE == 4 - asm ("lop3.b32 %0, %1, %2, %3, 
0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); + #if VECT_SIZE >= 4 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); #endif - #if VECT_SIZE == 8 - asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); - asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); - asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); + #if VECT_SIZE >= 8 asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4)); asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5)); asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6)); asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7)); #endif + #if VECT_SIZE >= 16 + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8)); + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9)); + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa)); + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb)); + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc)); + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd)); + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se)); + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf)); + #endif + return r; } @@ -1044,6 +1258,12 @@ static inline u32x amd_bytealign (const 
u32x a, const u32x b, const u32 c) return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7); #endif + + #if VECT_SIZE == 16 + const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7, a.s8, a.s9, a.sa, a.sb, a.sc, a.sd, a.se, a.sf)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7, b.s8, b.s9, b.sa, b.sb, b.sc, b.sd, b.se, b.sf))) >> ((c & 3) * 8); + + return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7, tmp.s8, tmp.s9, tmp.sa, tmp.sb, tmp.sc, tmp.sd, tmp.se, tmp.sf); + #endif } #endif diff --git a/docs/BUILD.md b/docs/BUILD.md index 1413982cb..60a9e6d91 100644 --- a/docs/BUILD.md +++ b/docs/BUILD.md @@ -6,6 +6,7 @@ oclHashcat build documentation # Authors: * Gabriele Gristina <> * Christoph Heuwieser <> +* magnum <> # Building oclHashcat for Linux and OSX @@ -39,7 +40,7 @@ Get a copy of the **oclHashcat** repository $ git clone https://github.com/hashcat/oclHashcat.git ``` -Basically all you need is the OpenCL Headers. +Basically all you need is the OpenCL Headers. Simply clone into the reference Implementation: diff --git a/docs/changes.txt b/docs/changes.txt index 3b0b65728..04325887e 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -23,6 +23,11 @@ Type.: Feature File.: Kernel Desc.: Extended support from 14 to 255 functions calls per rule on GPU +Type.: Feature +File.: Kernel +Desc.: Added support for vector width 16 +Issue: 226 + Type.: Feature File.: Host Desc.: Added support to utilize multiple different OpenCL platforms in parallel, ex: AMD + NV diff --git a/extra/tab_completion/oclHashcat.sh b/extra/tab_completion/oclHashcat.sh index 0c7f7d0cc..8a2a42927 100644 --- a/extra/tab_completion/oclHashcat.sh +++ b/extra/tab_completion/oclHashcat.sh @@ -1,3 +1,9 @@ +## Authors.....: Jens Steube +## magnum +## +## License.....: MIT +## + OCLHASHCAT_ROOT="." 
# helper functions @@ -175,7 +181,7 @@ _oclHashcat () local ATTACK_MODES="0 1 3 6 7" local OUTFILE_FORMATS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" local OPENCL_DEVICE_TYPES="1 2 3" - local OPENCL_VECTOR_WIDTH="1 2 4 8" + local OPENCL_VECTOR_WIDTH="1 2 4 8 16" local DEBUG_MODE="1 2 3 4" local WORKLOAD_PROFILE="1 2 3" local HIDDEN_FILES="exe|bin|pot|hcstat|dictstat|accepted|sh|cmd|bat|restore" diff --git a/include/common.h b/include/common.h index 980326521..1b784a342 100644 --- a/include/common.h +++ b/include/common.h @@ -1,5 +1,7 @@ /** - * Author......: Jens Steube + * Authors.....: Jens Steube + * magnum + * * License.....: MIT */ diff --git a/include/ext_ADL.h b/include/ext_ADL.h index 5c30d27fa..b90128b0c 100644 --- a/include/ext_ADL.h +++ b/include/ext_ADL.h @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ diff --git a/include/ext_nvml.h b/include/ext_nvml.h index aebf8987a..3b4edfce1 100644 --- a/include/ext_nvml.h +++ b/include/ext_nvml.h @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ diff --git a/include/shared.h b/include/shared.h index 4b4f9b3f3..4e437b0f0 100644 --- a/include/shared.h +++ b/include/shared.h @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -1268,6 +1269,8 @@ extern hc_thread_mutex_t mux_display; * functions */ +u32 is_power_of_2(u32 v); + u32 rotl32 (const u32 a, const u32 n); u32 rotr32 (const u32 a, const u32 n); u64 rotl64 (const u64 a, const u64 n); diff --git a/src/Makefile b/src/Makefile index 1befee06c..91660d9b6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,7 @@ ## ## Authors.....: Jens Steube ## Gabriele Gristina +## magnum ## ## License.....: MIT ## diff --git a/src/oclHashcat.c b/src/oclHashcat.c index 49bae1c07..efc8d7cc8 100644 --- a/src/oclHashcat.c +++ b/src/oclHashcat.c @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum 
* * License.....: MIT */ @@ -394,7 +395,7 @@ const char *USAGE_BIG[] = " --opencl-platforms=STR OpenCL platforms to use, separate with comma", " -d, --opencl-devices=STR OpenCL devices to use, separate with comma", " --opencl-device-types=STR OpenCL device-types to use, separate with comma, see references below", - " --opencl-vector-width=NUM OpenCL vector-width (either 1, 2, 4 or 8), overrides value from device query", + " --opencl-vector-width=NUM OpenCL vector-width (either 1, 2, 4, 8 or 16), overrides value from device query", " -w, --workload-profile=NUM Enable a specific workload profile, see references below", " -n, --kernel-accel=NUM Workload tuning, increase the outer-loop step size", " -u, --kernel-loops=NUM Workload tuning, increase the inner-loop step size", @@ -6306,7 +6307,7 @@ int main (int argc, char **argv) return (-1); } - if ((opencl_vector_width != 0) && (opencl_vector_width != 1) && (opencl_vector_width != 2) && (opencl_vector_width != 4) && (opencl_vector_width != 8)) + if (opencl_vector_width_chgd && (!is_power_of_2(opencl_vector_width) || opencl_vector_width > 16)) { log_error ("ERROR: opencl-vector-width %i not allowed", opencl_vector_width); @@ -12803,7 +12804,7 @@ int main (int argc, char **argv) vector_width = opencl_vector_width; } - if (vector_width > 8) vector_width = 8; + if (vector_width > 16) vector_width = 16; device_param->vector_width = vector_width; diff --git a/src/shared.c b/src/shared.c index d08c2bad4..86134c545 100644 --- a/src/shared.c +++ b/src/shared.c @@ -1,6 +1,7 @@ /** * Authors.....: Jens Steube * Gabriele Gristina + * magnum * * License.....: MIT */ @@ -16,6 +17,11 @@ * basic bit handling */ +u32 is_power_of_2(u32 v) +{ + return (v && !(v & (v - 1))); +} + u32 rotl32 (const u32 a, const u32 n) { return ((a << n) | (a >> (32 - n))); diff --git a/tools/test.sh b/tools/test.sh index 2a9b20ce3..3a0577e35 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -3,6 +3,7 @@ ## ## Authors.....: Gabriele Gristina ## Jens Steube 
+## magnum ## ## License.....: MIT ## @@ -14,7 +15,7 @@ HASH_TYPES="0 10 11 12 20 21 22 23 30 40 50 60 100 101 110 111 112 120 121 122 1 #ATTACK_MODES="0 1 3 6 7" ATTACK_MODES="0 1 3 7" -VECTOR_WIDTHS="1 2 4 8" +VECTOR_WIDTHS="1 2 4 8 16" MATCH_PASS_ONLY="2500 5300 5400 6600 6800 8200" @@ -1598,6 +1599,8 @@ while getopts "V:T:t:m:a:b:hcpd:x:o:" opt; do VECTOR=4 elif [ ${OPTARG} == "8" ]; then VECTOR=8 + elif [ ${OPTARG} == "16" ]; then + VECTOR=16 elif [ ${OPTARG} == "all" ]; then VECTOR="all" else