Mirror of https://github.com/hashcat/hashcat.git

More VEGA specific inline assembly to improve SHA1 based kernels

jsteube 2017-08-28 09:24:06 +02:00
parent a0be36d7b8
commit 9de1e557bb
5 changed files with 155 additions and 46 deletions

View File

@@ -130,20 +130,18 @@
 #define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z)))
 #endif

 #define SHA1_STEP_S(f,a,b,c,d,e,x) \
 { \
-  e += K; \
-  e = __add3_S (e, x, f (b, c, d)); \
-  e += rotl32_S (a, 5u); \
+  e = __add3_S (e, x, f (b, c, d)); \
+  e = __add3_S (e, K, rotl32_S (a, 5u)); \
   b = rotl32_S (b, 30u); \
 }

 #define SHA1_STEP(f,a,b,c,d,e,x) \
 { \
-  e += K; \
-  e = __add3 (e, x, f (b, c, d)); \
-  e += rotl32 (a, 5u); \
+  e = __add3 (e, x, f (b, c, d)); \
+  e = __add3 (e, K, rotl32 (a, 5u)); \
   b = rotl32 (b, 30u); \
 }

 #define SHA1_STEP0(f,a,b,c,d,e,x) \
@@ -160,19 +158,6 @@
   b = rotl32 (b, 30u); \
 }

-#define SHA1_STEP_PE(f,a,b,c,d,e,x) \
-{ \
-  e += x; \
-  e += f (b, c, d); \
-  e += rotl32 (a, 5u); \
-}
-
-#define SHA1_STEP_PB(f,a,b,c,d,e,x) \
-{ \
-  e += K; \
-  b = rotl32 (b, 30u); \
-}
-
 #define SHIFT_RIGHT_32(x,n) ((x) >> (n))

 #define SHA256_S0_S(x) (rotl32_S ((x), 25u) ^ rotl32_S ((x), 14u) ^ SHIFT_RIGHT_32 ((x), 3u))

View File

@@ -176,16 +176,43 @@ static u64x hl32_to_64 (const u32x a, const u32x b)
 #ifdef IS_AMD
 static u32 swap32_S (const u32 v)
 {
-  return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
+  #ifdef IS_AMD_ROCM
+  u32 t;
+
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
+
+  return t;
+  #else
+  return as_uint (as_uchar4 (v).s3210);
+  #endif
 }

 static u64 swap64_S (const u64 v)
 {
-  return bitselect (bitselect (rotate (v, 24ul),
-                               rotate (v,  8ul), 0x000000ff000000fful),
-                    bitselect (rotate (v, 56ul),
-                               rotate (v, 40ul), 0x00ff000000ff0000ul),
-                    0xffff0000ffff0000ul);
+  #ifdef IS_AMD_ROCM
+  const u32 v0 = h32_from_64_S (v);
+  const u32 v1 = l32_from_64_S (v);
+
+  u32 t0;
+  u32 t1;
+
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t0) : "v"(v0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t1) : "v"(v1), "v"(0x00010203));
+
+  const u64 r = hl32_to_64_S (t1, t0);
+
+  return r;
+  #else
+  return (as_ulong (as_uchar8 (v).s76543210));
+  #endif
 }

 static u32 rotr32_S (const u32 a, const u32 n)
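The V_PERM_B32 selector 0x00010203 picks the four bytes of the source operand in reverse order (as far as the GCN/Vega ISA describes it, selector byte values 0..3 index into the second source, which is why the other source can simply be 0), so the inline assembly is just a 32-bit byte swap matching the as_uchar4 (v).s3210 fallback. A host-side sketch in plain C of the permutation the selector encodes; names here are illustrative, not hashcat code:

  #include <assert.h>
  #include <stdint.h>

  /* Emulates V_PERM_B32 dst, 0, v, sel for selector byte values 0..3:
     destination byte i takes byte sel[i] of v (selector values 4..7 would
     come from the first source, which is 0 here and unused). */
  static uint32_t perm_b32_lo (uint32_t v, uint32_t sel)
  {
    uint32_t r = 0;
    for (int i = 0; i < 4; i++)
    {
      const uint32_t s = (sel >> (8 * i)) & 0xff;   /* selector for destination byte i */
      const uint32_t b = (v   >> (8 * s)) & 0xff;   /* byte s of the source            */
      r |= b << (8 * i);
    }
    return r;
  }

  int main (void)
  {
    assert (perm_b32_lo (0x11223344, 0x00010203) == 0x44332211);  /* plain byte swap */
    return 0;
  }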
@@ -218,16 +245,122 @@ static u64 rotl64_S (const u64 a, const u32 n)
 static u32x swap32 (const u32x v)
 {
+  #ifdef IS_AMD_ROCM
+  u32x t;
+
+  #if VECT_SIZE == 1
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 2
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s0) : "v"(v.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s1) : "v"(v.s1), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 4
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s2) : "v"(v.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s3) : "v"(v.s3), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 8
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s4) : "v"(v.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s5) : "v"(v.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s6) : "v"(v.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s7) : "v"(v.s7), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 16
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s8) : "v"(v.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s9) : "v"(v.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sa) : "v"(v.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sb) : "v"(v.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sc) : "v"(v.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sd) : "v"(v.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.se) : "v"(v.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sf) : "v"(v.sf), "v"(0x00010203));
+  #endif
+
+  return t;
+  #else
   return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
+  #endif
 }

 static u64x swap64 (const u64x v)
 {
+  #ifdef IS_AMD_ROCM
+  const u32x a0 = h32_from_64 (v);
+  const u32x a1 = l32_from_64 (v);
+
+  u32x t0;
+  u32x t1;
+
+  #if VECT_SIZE == 1
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0) : "v"(0), "v"(a0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1) : "v"(0), "v"(a1), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 2
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s0) : "v"(0), "v"(a0.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s0) : "v"(0), "v"(a1.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s1) : "v"(0), "v"(a0.s1), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s1) : "v"(0), "v"(a1.s1), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 4
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s2) : "v"(0), "v"(a0.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s2) : "v"(0), "v"(a1.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s3) : "v"(0), "v"(a0.s3), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s3) : "v"(0), "v"(a1.s3), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 8
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s4) : "v"(0), "v"(a0.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s4) : "v"(0), "v"(a1.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s5) : "v"(0), "v"(a0.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s5) : "v"(0), "v"(a1.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s6) : "v"(0), "v"(a0.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s6) : "v"(0), "v"(a1.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s7) : "v"(0), "v"(a0.s7), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s7) : "v"(0), "v"(a1.s7), "v"(0x00010203));
+  #endif
+
+  #if VECT_SIZE >= 16
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s8) : "v"(0), "v"(a0.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s8) : "v"(0), "v"(a1.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s9) : "v"(0), "v"(a0.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s9) : "v"(0), "v"(a1.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sa) : "v"(0), "v"(a0.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sa) : "v"(0), "v"(a1.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sb) : "v"(0), "v"(a0.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sb) : "v"(0), "v"(a1.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sc) : "v"(0), "v"(a0.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sc) : "v"(0), "v"(a1.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sd) : "v"(0), "v"(a0.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sd) : "v"(0), "v"(a1.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.se) : "v"(0), "v"(a0.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.se) : "v"(0), "v"(a1.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sf) : "v"(0), "v"(a0.sf), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sf) : "v"(0), "v"(a1.sf), "v"(0x00010203));
+  #endif
+
+  const u64x r = hl32_to_64 (t1, t0);
+
+  return r;
+  #else
   return bitselect (bitselect (rotate (v, 24ul),
                                rotate (v,  8ul), 0x000000ff000000fful),
                     bitselect (rotate (v, 56ul),
                                rotate (v, 40ul), 0x00ff000000ff0000ul),
                     0xffff0000ffff0000ul);
+  #endif
 }

 static u32x rotr32 (const u32x a, const u32 n)
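Both branches of the vectorized swap32/swap64 compute a plain byte reverse; the non-ROCM path keeps the existing bitselect/rotate formulation, which takes bytes 0 and 2 from rotate (v, 8u) and bytes 1 and 3 from rotate (v, 24u). A small plain-C check of that equivalence; bitselect32 and rotl32 here emulate the OpenCL built-ins and are not hashcat code:

  #include <assert.h>
  #include <stdint.h>

  static uint32_t rotl32      (uint32_t x, uint32_t n)             { return (x << n) | (x >> (32 - n)); }
  static uint32_t bitselect32 (uint32_t a, uint32_t b, uint32_t m) { return (a & ~m) | (b & m); }

  static uint32_t bswap32 (uint32_t v)
  {
    return (v >> 24) | ((v >> 8) & 0x0000ff00u) | ((v << 8) & 0x00ff0000u) | (v << 24);
  }

  int main (void)
  {
    const uint32_t samples[] = { 0x00000000u, 0x01234567u, 0x89abcdefu, 0xffffffffu };

    for (int i = 0; i < 4; i++)
    {
      const uint32_t v = samples[i];

      assert (bitselect32 (rotl32 (v, 24), rotl32 (v, 8), 0x00ff00ffu) == bswap32 (v));
    }
    return 0;
  }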

View File

@@ -362,7 +362,7 @@ void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
   * reverse
   */

-  const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+  const u32 e_rev = rotl32_S (search[1], 2u);

  /**
   * loop
@@ -499,13 +499,10 @@
     SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
     SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
     SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-    SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+    SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));

     if (MATCHES_NONE_VS (e, e_rev)) continue;

-    SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
-
     const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
     const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
     const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
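The e_rev change above pairs with the macro change in the first file: the round before the early exit now runs a full SHA1_STEP, which already adds the round constant, instead of SHA1_STEP_PE, so the constant no longer has to be subtracted from the reversed target. The same substitution appears in m00110s and m00130s below. A minimal sketch of why the two comparisons are the same predicate under 32-bit wraparound (plain C; the target value is an arbitrary stand-in for rotl32_S (search[1], 2u)):

  #include <assert.h>
  #include <stdint.h>

  int main (void)
  {
    const uint32_t SHA1C03 = 0xca62c1d6u;   /* SHA-1 constant for rounds 60..79 */
    const uint32_t target  = 0x9fc1f16du;   /* stand-in for rotl32_S (search[1], 2u) */

    for (uint32_t e = 0; e < 0x100000u; e += 0x101u)
    {
      const int old_hit = (e           == target - SHA1C03);  /* SHA1_STEP_PE, e_rev with - SHA1C03 */
      const int new_hit = (e + SHA1C03 == target);            /* full SHA1_STEP, plain e_rev        */

      assert (old_hit == new_hit);
    }
    return 0;
  }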

View File

@@ -410,7 +410,7 @@ void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
   * reverse
   */

-  const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+  const u32 e_rev = rotl32_S (search[1], 2u);

  /**
   * loop
@@ -547,13 +547,10 @@ void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
     SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
     SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
     SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-    SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+    SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));

     if (MATCHES_NONE_VS (e, e_rev)) continue;

-    SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
-
     const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
     const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
     const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);

View File

@@ -410,7 +410,7 @@ void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
   * reverse
   */

-  const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+  const u32 e_rev = rotl32_S (search[1], 2u);

  /**
   * loop
@@ -547,13 +547,10 @@ void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
     SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
     SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
     SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-    SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+    SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));

     if (MATCHES_NONE_VS (e, e_rev)) continue;

-    SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
-
     const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
     const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
     const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);