diff --git a/.gitignore b/.gitignore
index b268eedfb..99a9615aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ obj/*.a
 include/CL
 tools/luks_tests
 .vscode
+test_edge*

 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl
index d029a2bf3..e0a9f28ed 100644
--- a/OpenCL/inc_common.cl
+++ b/OpenCL/inc_common.cl
@@ -1781,6 +1781,7 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
   return r;
 }

+#ifdef USE_FUNNELSHIFT
 DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
 {
   const int c_mod_4 = c & 3;
@@ -1788,35 +1789,35 @@ DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
   u32x r;

   #if VECT_SIZE == 1
-  r = __funnelshift_r (b, a, c_mod_4 * 8);
+  r = hc_funnelshift_r (b, a, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 2
-  r.s0 = __funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
-  r.s1 = __funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
+  r.s0 = hc_funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
+  r.s1 = hc_funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 4
-  r.s2 = __funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
-  r.s3 = __funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
+  r.s2 = hc_funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
+  r.s3 = hc_funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 8
-  r.s4 = __funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
-  r.s5 = __funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
-  r.s6 = __funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
-  r.s7 = __funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
+  r.s4 = hc_funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
+  r.s5 = hc_funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
+  r.s6 = hc_funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
+  r.s7 = hc_funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 16
-  r.s8 = __funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
-  r.s9 = __funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
-  r.sa = __funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
-  r.sb = __funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
-  r.sc = __funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
-  r.sd = __funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
-  r.se = __funnelshift_r (b.se, a.se, c_mod_4 * 8);
-  r.sf = __funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
+  r.s8 = hc_funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
+  r.s9 = hc_funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
+  r.sa = hc_funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
+  r.sb = hc_funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
+  r.sc = hc_funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
+  r.sd = hc_funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
+  r.se = hc_funnelshift_r (b.se, a.se, c_mod_4 * 8);
+  r.sf = hc_funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
   #endif

   return r;
@@ -1826,7 +1827,7 @@ DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
 {
   const int c_mod_4 = c & 3;

-  const u32 r = __funnelshift_r (b, a, c_mod_4 * 8);
+  const u32 r = hc_funnelshift_r (b, a, c_mod_4 * 8);

   return r;
 }
@@ -1838,35 +1839,35 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
   u32x r;

   #if VECT_SIZE == 1
-  r = __funnelshift_l (a, b, c_mod_4 * 8);
+  r = hc_funnelshift_l (a, b, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 2
-  r.s0 = __funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
-  r.s1 = __funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
+  r.s0 = hc_funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
+  r.s1 = hc_funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 4
-  r.s2 = __funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
-  r.s3 = __funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
+  r.s2 = hc_funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
+  r.s3 = hc_funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 8
-  r.s4 = __funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
-  r.s5 = __funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
-  r.s6 = __funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
-  r.s7 = __funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
+  r.s4 = hc_funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
+  r.s5 = hc_funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
+  r.s6 = hc_funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
+  r.s7 = hc_funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 16
-  r.s8 = __funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
-  r.s9 = __funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
-  r.sa = __funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
-  r.sb = __funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
-  r.sc = __funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
-  r.sd = __funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
-  r.se = __funnelshift_l (a.se, b.se, c_mod_4 * 8);
-  r.sf = __funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
+  r.s8 = hc_funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
+  r.s9 = hc_funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
+  r.sa = hc_funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
+  r.sb = hc_funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
+  r.sc = hc_funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
+  r.sd = hc_funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
+  r.se = hc_funnelshift_l (a.se, b.se, c_mod_4 * 8);
+  r.sf = hc_funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
   #endif

   return r;
@@ -1876,10 +1877,51 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
 {
   const int c_mod_4 = c & 3;

-  const u32 r = __funnelshift_l (a, b, c_mod_4 * 8);
+  const u32 r = hc_funnelshift_l (a, b, c_mod_4 * 8);

   return r;
 }
+#else
+DECLSPEC u32 hc_bytealign_be (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const u32 r = hc_byte_perm (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const u32 r = hc_byte_perm_S (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const int c_minus_4 = 4 - c_mod_4;
+
+  const u32x r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const int c_minus_4 = 4 - c_mod_4;
+
+  const u32 r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+
+  return r;
+}
+#endif

 DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
 {
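/*
 * Editor's note -- not part of the patch. The #else branch above rebuilds the
 * bytealign helpers on top of hc_byte_perm, and the selector math can be
 * sanity-checked on the host. The sketch below is illustrative only:
 * byte_perm() is a local model written for this note, assuming CUDA
 * __byte_perm semantics (result byte i is byte sel.nibble(i) of the
 * little-endian 8-byte pool {y, x}); the references implement the
 * funnel-shift definitions used in the #ifdef USE_FUNNELSHIFT branch.
 */

#include <stdint.h>
#include <assert.h>

static uint32_t byte_perm (uint32_t x, uint32_t y, uint32_t sel)
{
  const uint64_t v = ((uint64_t) y << 32) | x; // pool: x = bytes 0-3, y = bytes 4-7

  uint32_t r = 0;

  for (int i = 0; i < 4; i++)
  {
    const int b = (sel >> (i * 4)) & 7; // each selector nibble picks a source byte

    r |= (uint32_t) ((v >> (b * 8)) & 0xff) << (i * 8);
  }

  return r;
}

int main (void)
{
  const uint32_t a = 0x11223344;
  const uint32_t b = 0xaabbccdd;

  for (int c = 0; c < 4; c++)
  {
    // funnel-shift references: bytealign_be = low word of (a:b) >> (c * 8),
    // bytealign = high word of (b:a) << (c * 8)
    const uint32_t ref_be = (uint32_t) (((((uint64_t) a << 32) | b) >> (c * 8)));
    const uint32_t ref_le = (uint32_t) ((((((uint64_t) b << 32) | a) << (c * 8))) >> 32);

    assert (byte_perm (b, a, (0x76543210 >> (c * 4)) & 0xffff) == ref_be);
    assert (byte_perm (a, b, (0x76543210 >> ((4 - c) * 4)) & 0xffff) == ref_le);
  }

  return 0;
}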
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index 0a386f22e..284dbf02a 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -6,6 +6,7 @@
 #include "inc_vendor.h"
 #include "inc_types.h"
 #include "inc_platform.h"
+#include "inc_common.h"

 #ifdef IS_NATIVE
 #define FIXED_THREAD_COUNT(n)
@@ -60,6 +61,33 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)

 #endif // IS_AMD

+// this applies to cuda and opencl
+#if defined IS_NV
+
+#ifdef USE_FUNNELSHIFT
+
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift)
+{
+  u32 result;
+
+  asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+
+  return result;
+}
+
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift)
+{
+  u32 result;
+
+  asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+
+  return result;
+}
+
+#endif
+
+#endif // IS_NV
+
 #if defined IS_CUDA

 #if ATTACK_EXEC == 11
@@ -243,7 +271,7 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
   #ifdef USE_FUNNELSHIFT
-  return __funnelshift_l (a, a, n);
+  return hc_funnelshift_l (a, a, n);
   #else
   return ((a << n) | ((a >> (32 - n))));
   #endif
 }
@@ -252,7 +280,7 @@ DECLSPEC u32 rotl32_S (const u32 a, const int n)
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
   #ifdef USE_FUNNELSHIFT
-  return __funnelshift_r (a, a, n);
+  return hc_funnelshift_r (a, a, n);
   #else
   return ((a >> n) | ((a << (32 - n))));
   #endif
 }
@@ -472,29 +500,17 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)

 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
-  return __funnelshift_l (a, a, n);
-  #else
-  return ((a << n) | ((a >> (32 - n))));
-  #endif
+  return rotr32_S (a, 32 - n);
 }

 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
-  return __funnelshift_r (a, a, n);
-  #else
-  return ((a >> n) | ((a << (32 - n))));
-  #endif
+  return __builtin_amdgcn_alignbit (a, a, n);
 }

 DECLSPEC u64x rotl64 (const u64x a, const int n)
 {
-  #if VECT_SIZE == 1
-  return rotl64_S (a, n);
-  #else
-  return ((a << n) | ((a >> (64 - n))));
-  #endif
+  return rotr64 (a, 64 - n);
 }

 DECLSPEC u64x rotr64 (const u64x a, const int n)
@@ -513,7 +529,6 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n)

 DECLSPEC u64 rotr64_S (const u64 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
   vconv64_t in;

   in.v64 = a;
@@ -523,21 +538,20 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)

   vconv64_t out;

+  const int n31 = n & 31;
+
   if (n < 32)
   {
-    out.v32.a = __funnelshift_r (a0, a1, n);
-    out.v32.b = __funnelshift_r (a1, a0, n);
+    out.v32.a = __builtin_amdgcn_alignbit (a1, a0, n31);
+    out.v32.b = __builtin_amdgcn_alignbit (a0, a1, n31);
   }
   else
   {
-    out.v32.a = __funnelshift_r (a1, a0, n - 32);
-    out.v32.b = __funnelshift_r (a0, a1, n - 32);
+    out.v32.a = __builtin_amdgcn_alignbit (a0, a1, n31);
+    out.v32.b = __builtin_amdgcn_alignbit (a1, a0, n31);
   }

   return out.v64;
-  #else
-  return ((a >> n) | ((a << (64 - n))));
-  #endif
 }

 #define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0)
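/*
 * Editor's note -- not part of the patch. hc_funnelshift_l/_r above wrap the
 * PTX funnel-shift instruction. Per the PTX ISA, shf.l.wrap.b32 returns the
 * upper 32 bits of {hi, lo} << (shift & 31), and shf.r.wrap.b32 the lower
 * 32 bits of {hi, lo} >> (shift & 31). A minimal host-side model of those
 * semantics, checking the hi == lo special case that rotl32_S/rotr32_S rely on:
 */

#include <stdint.h>
#include <assert.h>

static uint32_t funnelshift_l (uint32_t lo, uint32_t hi, int shift)
{
  const uint64_t v = ((uint64_t) hi << 32) | lo;

  return (uint32_t) ((v << (shift & 31)) >> 32); // upper word after left shift
}

static uint32_t funnelshift_r (uint32_t lo, uint32_t hi, int shift)
{
  const uint64_t v = ((uint64_t) hi << 32) | lo;

  return (uint32_t) (v >> (shift & 31)); // lower word after right shift
}

int main (void)
{
  const uint32_t a = 0xdeadbeef;

  for (int n = 1; n < 32; n++)
  {
    assert (funnelshift_l (a, a, n) == ((a << n) | (a >> (32 - n)))); // rotl32
    assert (funnelshift_r (a, a, n) == ((a >> n) | (a << (32 - n)))); // rotr32
  }

  return 0;
}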
diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h
index 9729d4fad..800988056 100644
--- a/OpenCL/inc_platform.h
+++ b/OpenCL/inc_platform.h
@@ -21,6 +21,11 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n);
 DECLSPEC u64 rotr64_S (const u64 a, const int n);
 #endif // IS_AMD

+#ifdef IS_NV
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift);
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift);
+#endif // IS_NV
+
 #ifdef IS_CUDA
 DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
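/*
 * Editor's note -- not part of the patch. The rotr64_S rewrite in
 * inc_platform.cl above decomposes a 64-bit rotate into two 32-bit funnel
 * shifts. A host-side sketch of that decomposition, assuming
 * __builtin_amdgcn_alignbit (hi, lo, n) returns the low 32 bits of
 * {hi, lo} >> (n & 31), and that vconv64_t maps .v32.a/.v32.b to the
 * low/high word respectively:
 */

#include <stdint.h>
#include <assert.h>

static uint32_t alignbit (uint32_t hi, uint32_t lo, int n)
{
  const uint64_t v = ((uint64_t) hi << 32) | lo;

  return (uint32_t) (v >> (n & 31));
}

static uint64_t rotr64_model (uint64_t a, int n)
{
  const uint32_t a0 = (uint32_t) (a >>  0); // low word  (in.v32.a)
  const uint32_t a1 = (uint32_t) (a >> 32); // high word (in.v32.b)

  const int n31 = n & 31;

  uint32_t lo;
  uint32_t hi;

  if (n < 32) // rotate amount below word size: words keep their roles
  {
    lo = alignbit (a1, a0, n31);
    hi = alignbit (a0, a1, n31);
  }
  else        // rotate amount >= 32: same 5-bit shift, words swapped
  {
    lo = alignbit (a0, a1, n31);
    hi = alignbit (a1, a0, n31);
  }

  return ((uint64_t) hi << 32) | lo;
}

int main (void)
{
  const uint64_t a = 0x0123456789abcdefULL;

  for (int n = 1; n < 64; n++)
  {
    assert (rotr64_model (a, n) == ((a >> n) | (a << (64 - n))));
  }

  return 0;
}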
diff --git a/OpenCL/m06900_a3-optimized.cl b/OpenCL/m06900_a3-optimized.cl
index d71302d55..064eb3e58 100644
--- a/OpenCL/m06900_a3-optimized.cl
+++ b/OpenCL/m06900_a3-optimized.cl
@@ -1190,6 +1190,71 @@ KERNEL_FQ KERNEL_FA void m06900_m08 (KERN_ATTR_BASIC ())

 KERNEL_FQ KERNEL_FA void m06900_m16 (KERN_ATTR_BASIC ())
 {
+  /**
+   * base
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * sbox
+   */
+
+  LOCAL_VK u32 s_tables[4][256];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    s_tables[0][i] = c_tables[0][i];
+    s_tables[1][i] = c_tables[1][i];
+    s_tables[2][i] = c_tables[2][i];
+    s_tables[3][i] = c_tables[3][i];
+  }
+
+  SYNC_THREADS ();
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * modifier
+   */
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4]; // no change here, because m06900m() doesn't support > 32, but we need a _m16 kernel because hashcat will call _m16 if pw_len >= 32
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  /**
+   * main
+   */
+
+  m06900m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }

 KERNEL_FQ KERNEL_FA void m06900_s04 (KERN_ATTR_BASIC ())
@@ -1332,4 +1397,69 @@ KERNEL_FQ KERNEL_FA void m06900_s08 (KERN_ATTR_BASIC ())

 KERNEL_FQ KERNEL_FA void m06900_s16 (KERN_ATTR_BASIC ())
 {
+  /**
+   * base
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * sbox
+   */
+
+  LOCAL_VK u32 s_tables[4][256];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    s_tables[0][i] = c_tables[0][i];
+    s_tables[1][i] = c_tables[1][i];
+    s_tables[2][i] = c_tables[2][i];
+    s_tables[3][i] = c_tables[3][i];
+  }
+
+  SYNC_THREADS ();
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * modifier
+   */
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4]; // no change here, because m06900s() doesn't support > 32, but we need a _s16 kernel because hashcat will call _s16 if pw_len >= 32
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  /**
+   * main
+   */
+
+  m06900s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }