diff --git a/OpenCL/inc_hash_functions.cl b/OpenCL/inc_hash_functions.cl
index 802cab90d..06225acf9 100644
--- a/OpenCL/inc_hash_functions.cl
+++ b/OpenCL/inc_hash_functions.cl
@@ -130,20 +130,18 @@
 #define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z)))
 #endif

-#define SHA1_STEP_S(f,a,b,c,d,e,x) \
-{ \
-  e += K; \
-  e = __add3_S (e, x, f (b, c, d)); \
-  e += rotl32_S (a, 5u); \
-  b = rotl32_S (b, 30u); \
+#define SHA1_STEP_S(f,a,b,c,d,e,x) \
+{ \
+  e = __add3_S (e, x, f (b, c, d)); \
+  e = __add3_S (e, K, rotl32_S (a, 5u)); \
+  b = rotl32_S (b, 30u); \
 }

-#define SHA1_STEP(f,a,b,c,d,e,x) \
-{ \
-  e += K; \
-  e = __add3 (e, x, f (b, c, d)); \
-  e += rotl32 (a, 5u); \
-  b = rotl32 (b, 30u); \
+#define SHA1_STEP(f,a,b,c,d,e,x) \
+{ \
+  e = __add3 (e, x, f (b, c, d)); \
+  e = __add3 (e, K, rotl32 (a, 5u)); \
+  b = rotl32 (b, 30u); \
 }

 #define SHA1_STEP0(f,a,b,c,d,e,x) \
@@ -160,19 +158,6 @@
   b = rotl32 (b, 30u); \
 }

-#define SHA1_STEP_PE(f,a,b,c,d,e,x) \
-{ \
-  e += x; \
-  e += f (b, c, d); \
-  e += rotl32 (a, 5u); \
-}
-
-#define SHA1_STEP_PB(f,a,b,c,d,e,x) \
-{ \
-  e += K; \
-  b = rotl32 (b, 30u); \
-}
-
 #define SHIFT_RIGHT_32(x,n) ((x) >> (n))

 #define SHA256_S0_S(x) (rotl32_S ((x), 25u) ^ rotl32_S ((x), 14u) ^ SHIFT_RIGHT_32 ((x), 3u))
diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl
index 51c024aea..d6770540c 100644
--- a/OpenCL/inc_types.cl
+++ b/OpenCL/inc_types.cl
@@ -176,16 +176,43 @@ static u64x hl32_to_64 (const u32x a, const u32x b)
 #ifdef IS_AMD
 static u32 swap32_S (const u32 v)
 {
-  return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
+  #ifdef IS_AMD_ROCM
+
+  u32 t;
+
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
+
+  return t;
+
+  #else
+
+  return as_uint (as_uchar4 (v).s3210);
+
+  #endif
 }

 static u64 swap64_S (const u64 v)
 {
-  return bitselect (bitselect (rotate (v, 24ul),
-                               rotate (v, 8ul), 0x000000ff000000fful),
-                    bitselect (rotate (v, 56ul),
-                               rotate (v, 40ul), 0x00ff000000ff0000ul),
-                                                 0xffff0000ffff0000ul);
+  #ifdef IS_AMD_ROCM
+
+  const u32 v0 = h32_from_64_S (v);
+  const u32 v1 = l32_from_64_S (v);
+
+  u32 t0;
+  u32 t1;
+
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t0) : "v"(v0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t1) : "v"(v1), "v"(0x00010203));
+
+  const u64 r = hl32_to_64_S (t1, t0);
+
+  return r;
+
+  #else
+
+  return (as_ulong (as_uchar8 (v).s76543210));
+
+  #endif
 }

 static u32 rotr32_S (const u32 a, const u32 n)
@@ -218,16 +245,122 @@ static u64 rotl64_S (const u64 a, const u32 n)

 static u32x swap32 (const u32x v)
 {
+  #ifdef IS_AMD_ROCM
+
+  u32x t;
+
+  #if VECT_SIZE == 1
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 2
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s0) : "v"(v.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s1) : "v"(v.s1), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 4
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s2) : "v"(v.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s3) : "v"(v.s3), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 8
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s4) : "v"(v.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s5) : "v"(v.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s6) : "v"(v.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s7) : "v"(v.s7), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 16
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s8) : "v"(v.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s9) : "v"(v.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sa) : "v"(v.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sb) : "v"(v.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sc) : "v"(v.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sd) : "v"(v.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.se) : "v"(v.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sf) : "v"(v.sf), "v"(0x00010203));
+  #endif
+
+  return t;
+
+  #else
+
   return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
+
+  #endif
 }

 static u64x swap64 (const u64x v)
 {
+  #ifdef IS_AMD_ROCM
+
+  const u32x a0 = h32_from_64 (v);
+  const u32x a1 = l32_from_64 (v);
+
+  u32x t0;
+  u32x t1;
+
+  #if VECT_SIZE == 1
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0) : "v"(0), "v"(a0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1) : "v"(0), "v"(a1), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 2
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s0) : "v"(0), "v"(a0.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s0) : "v"(0), "v"(a1.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s1) : "v"(0), "v"(a0.s1), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s1) : "v"(0), "v"(a1.s1), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 4
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s2) : "v"(0), "v"(a0.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s2) : "v"(0), "v"(a1.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s3) : "v"(0), "v"(a0.s3), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s3) : "v"(0), "v"(a1.s3), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 8
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s4) : "v"(0), "v"(a0.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s4) : "v"(0), "v"(a1.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s5) : "v"(0), "v"(a0.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s5) : "v"(0), "v"(a1.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s6) : "v"(0), "v"(a0.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s6) : "v"(0), "v"(a1.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s7) : "v"(0), "v"(a0.s7), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s7) : "v"(0), "v"(a1.s7), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 16
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s8) : "v"(0), "v"(a0.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s8) : "v"(0), "v"(a1.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s9) : "v"(0), "v"(a0.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s9) : "v"(0), "v"(a1.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sa) : "v"(0), "v"(a0.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sa) : "v"(0), "v"(a1.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sb) : "v"(0), "v"(a0.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sb) : "v"(0), "v"(a1.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sc) : "v"(0), "v"(a0.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sc) : "v"(0), "v"(a1.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sd) : "v"(0), "v"(a0.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sd) : "v"(0), "v"(a1.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.se) : "v"(0), "v"(a0.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.se) : "v"(0), "v"(a1.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sf) : "v"(0), "v"(a0.sf), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sf) : "v"(0), "v"(a1.sf), "v"(0x00010203));
+  #endif
+
+  const u64x r = hl32_to_64 (t1, t0);
+
+  return r;
+
+  #else
+
   return bitselect (bitselect (rotate (v, 24ul),
                                rotate (v, 8ul), 0x000000ff000000fful),
                     bitselect (rotate (v, 56ul),
                                rotate (v, 40ul), 0x00ff000000ff0000ul),
                                                  0xffff0000ffff0000ul);
+  #endif
 }

 static u32x rotr32 (const u32x a, const u32 n)
diff --git a/OpenCL/m00100_a3-optimized.cl b/OpenCL/m00100_a3-optimized.cl
index d988851f3..a22e98b13 100644
--- a/OpenCL/m00100_a3-optimized.cl
+++ b/OpenCL/m00100_a3-optimized.cl
@@ -362,7 +362,7 @@ void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
    * reverse
    */

-  const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+  const u32 e_rev = rotl32_S (search[1], 2u);

   /**
    * loop
@@ -499,13 +499,10 @@ void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
     SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
     SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
     SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-
-    SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+    SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));

     if (MATCHES_NONE_VS (e, e_rev)) continue;

-    SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
-
     const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
     const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
     const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
diff --git a/OpenCL/m00110_a3-optimized.cl b/OpenCL/m00110_a3-optimized.cl
index 98f12c0e7..ee5bd4fac 100644
--- a/OpenCL/m00110_a3-optimized.cl
+++ b/OpenCL/m00110_a3-optimized.cl
@@ -410,7 +410,7 @@ void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
    * reverse
    */

-  const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+  const u32 e_rev = rotl32_S (search[1], 2u);

   /**
    * loop
@@ -547,13 +547,10 @@ void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
     SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
     SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
     SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-
-    SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+    SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));

     if (MATCHES_NONE_VS (e, e_rev)) continue;

-    SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
-
     const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
     const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
     const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
diff --git a/OpenCL/m00130_a3-optimized.cl b/OpenCL/m00130_a3-optimized.cl
index b4f0d8a72..3cc025e82 100644
--- a/OpenCL/m00130_a3-optimized.cl
+++ b/OpenCL/m00130_a3-optimized.cl
@@ -410,7 +410,7 @@ void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
    * reverse
    */

-  const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+  const u32 e_rev = rotl32_S (search[1], 2u);

   /**
    * loop
@@ -547,13 +547,10 @@ void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
     SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
     SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
     SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-
-    SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+    SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));

     if (MATCHES_NONE_VS (e, e_rev)) continue;

-    SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
-
     const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
     const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
     const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
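Note (reviewer aside, not part of the patch): the rewritten SHA1_STEP/SHA1_STEP_S still add the same five terms modulo 2^32, they only regroup the additions into two __add3 calls; and the new non-ROCm swap32_S fallback, as_uint (as_uchar4 (v).s3210), is a plain byte reversal and matches the old rotate/bitselect form. Because the last tracked round now runs a full SHA1_STEP (which folds in K), the precomputed e_rev drops its - SHA1C03 correction, and the now-unused SHA1_STEP_PE/SHA1_STEP_PB macros go away. A minimal host-side C check of the two arithmetic identities (helper names here are illustrative, not hashcat API; the GCN V_PERM_B32 path is not modelled):

/* sha1_swap_check.c -- compile with: cc sha1_swap_check.c && ./a.out */

#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32 (const uint32_t x, const uint32_t n)
{
  return (x << n) | (x >> (32 - n));
}

static uint32_t sha1_f1 (const uint32_t x, const uint32_t y, const uint32_t z)
{
  return x ^ y ^ z;   /* parity function used in rounds 60..79 */
}

/* old grouping: e += K; e += x + f; e += rotl (a, 5) */
static uint32_t step_old (uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t x, uint32_t K)
{
  e += K;
  e += x + sha1_f1 (b, c, d);
  e += rotl32 (a, 5);
  return e;
}

/* new grouping: e += x + f; e += K + rotl (a, 5) -- same sum modulo 2^32 */
static uint32_t step_new (uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t x, uint32_t K)
{
  e += x + sha1_f1 (b, c, d);
  e += K + rotl32 (a, 5);
  return e;
}

/* old swap32: bitselect (rotate (v, 24), rotate (v, 8), 0x00ff00ff) */
static uint32_t swap32_rot (const uint32_t v)
{
  const uint32_t r24 = rotl32 (v, 24);
  const uint32_t r8  = rotl32 (v,  8);

  return (r24 & 0xff00ff00u) | (r8 & 0x00ff00ffu);
}

/* new fallback: reverse the four bytes directly */
static uint32_t swap32_rev (const uint32_t v)
{
  return (v >> 24)
       | ((v >>  8) & 0x0000ff00u)
       | ((v <<  8) & 0x00ff0000u)
       | (v << 24);
}

int main (void)
{
  const uint32_t a = 0x67452301, b = 0xefcdab89, c = 0x98badcfe, d = 0x10325476, e = 0xc3d2e1f0;
  const uint32_t x = 0xdeadbeef;
  const uint32_t K = 0xca62c1d6;   /* SHA1C03, the round 60..79 constant */

  printf ("step: %08x %08x\n", (unsigned) step_old (a, b, c, d, e, x, K),
                               (unsigned) step_new (a, b, c, d, e, x, K));
  printf ("swap: %08x %08x\n", (unsigned) swap32_rot (0x01020304u),
                               (unsigned) swap32_rev (0x01020304u));

  return 0;
}

Both output lines print identical pairs, which is what lets the kernels drop the SHA1_STEP_PE/SHA1_STEP_PB split around the MATCHES_NONE_VS early exit.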