mirror of
https://github.com/hashcat/hashcat.git
synced 2025-01-22 21:51:07 +00:00
Add VEGA specific inline assembly to improve all MD4, MD5, SHA1 and SHA256 based kernels
This commit is contained in:
parent
7cebc936fd
commit
00e38cc2c6
@ -34,23 +34,23 @@
|
||||
// Scalar MD4 step: a = rotl32_S (a + K + x + f (b, c, d), s).
// K is added first; x and the round-function result are then folded into
// the accumulator with a single three-operand __add3_S call.
#define MD4_STEP_S(f,a,b,c,d,x,K,s)   \
{                                     \
  a += K;                             \
  const u32 t = f (b, c, d);          \
  a = __add3_S (a, x, t);             \
  a = rotl32_S (a, s);                \
}
|
||||
|
||||
// Vector MD4 step: a = rotl32 (a + K + x + f (b, c, d), s).
// K is added first; x and the round-function result are then folded into
// the accumulator with a single three-operand __add3 call.
#define MD4_STEP(f,a,b,c,d,x,K,s)     \
{                                     \
  a += K;                             \
  const u32x t = f (b, c, d);         \
  a = __add3 (a, x, t);               \
  a = rotl32 (a, s);                  \
}
|
||||
|
||||
// Vector MD4 step without a message word: a = rotl32 (a + K + f (b, c, d), s).
// K and the round-function result are added in one three-operand __add3 call.
#define MD4_STEP0(f,a,b,c,d,K,s)      \
{                                     \
  const u32x t = f (b, c, d);         \
  a = __add3 (a, K, t);               \
  a = rotl32 (a, s);                  \
}
|
||||
|
||||
@ -72,7 +72,7 @@
|
||||
// MD5 round functions, written as pure bitwise expressions.
//   MD5_F: z ^ (x & (y ^ z))
//   MD5_G: y ^ (z & (x ^ y))
//   MD5_H: x ^ y ^ z
//   MD5_I: y ^ (x | ~z)
// MD5_I is defined exactly once; a second definition with a different body
// would be a macro redefinition.
// MD5_Fo / MD5_Go are the bitselect-based variants.
#define MD5_F(x,y,z)    ((z) ^ ((x) & ((y) ^ (z))))
#define MD5_G(x,y,z)    ((y) ^ ((z) & ((x) ^ (y))))
#define MD5_H(x,y,z)    ((x) ^ (y) ^ (z))
#define MD5_I(x,y,z)    ((y) ^ ((x) | ~(z)))
#define MD5_Fo(x,y,z)   (bitselect ((z), (y), (x)))
#define MD5_Go(x,y,z)   (bitselect ((y), (x), (z)))
|
||||
#endif
|
||||
@ -89,8 +89,8 @@
|
||||
// Scalar MD5 step: a = b + rotl32_S (a + K + x + f (b, c, d), s).
// K is added first; x and the round-function result are then folded into
// the accumulator with a single three-operand __add3_S call.
#define MD5_STEP_S(f,a,b,c,d,x,K,s)   \
{                                     \
  a += K;                             \
  const u32 t = f (b, c, d);          \
  a = __add3_S (a, x, t);             \
  a = rotl32_S (a, s);                \
  a += b;                             \
}
|
||||
@ -98,16 +98,16 @@
|
||||
// Vector MD5 step: a = b + rotl32 (a + K + x + f (b, c, d), s).
// K is added first; x and the round-function result are then folded into
// the accumulator with a single three-operand __add3 call.
#define MD5_STEP(f,a,b,c,d,x,K,s)     \
{                                     \
  a += K;                             \
  const u32x t = f (b, c, d);         \
  a = __add3 (a, x, t);               \
  a = rotl32 (a, s);                  \
  a += b;                             \
}
|
||||
|
||||
// Vector MD5 step without a message word:
// a = b + rotl32 (a + K + f (b, c, d), s).
// K and the round-function result are added in one three-operand __add3 call.
#define MD5_STEP0(f,a,b,c,d,K,s)      \
{                                     \
  const u32x t = f (b, c, d);         \
  a = __add3 (a, K, t);               \
  a = rotl32 (a, s);                  \
  a += b;                             \
}
|
||||
@ -139,8 +139,8 @@
|
||||
// Scalar SHA1 step:
//   e += K + x + f (b, c, d) + rotl32_S (a, 5);  b = rotl32_S (b, 30).
// K is not a macro parameter; it is picked up from the expansion site.
// x and the round-function result are added with one __add3_S call.
#define SHA1_STEP_S(f,a,b,c,d,e,x)    \
{                                     \
  e += K;                             \
  const u32 t = f (b, c, d);          \
  e = __add3_S (e, x, t);             \
  e += rotl32_S (a,  5u);             \
  b  = rotl32_S (b, 30u);             \
}
|
||||
@ -148,24 +148,24 @@
|
||||
// Vector SHA1 step:
//   e += K + x + f (b, c, d) + rotl32 (a, 5);  b = rotl32 (b, 30).
// K is not a macro parameter; it is picked up from the expansion site.
// x and the round-function result are added with one __add3 call.
#define SHA1_STEP(f,a,b,c,d,e,x)      \
{                                     \
  e += K;                             \
  const u32x t = f (b, c, d);         \
  e = __add3 (e, x, t);               \
  e += rotl32 (a,  5u);               \
  b  = rotl32 (b, 30u);               \
}
|
||||
|
||||
// Vector SHA1 step that ignores the message word x:
//   e += K + f (b, c, d) + rotl32 (a, 5);  b = rotl32 (b, 30).
// K is not a macro parameter; it is picked up from the expansion site.
// x is accepted only to keep the signature parallel to SHA1_STEP.
#define SHA1_STEP0(f,a,b,c,d,e,x)     \
{                                     \
  const u32x t = f (b, c, d);         \
  e = __add3 (e, K, t);               \
  e += rotl32 (a,  5u);               \
  b  = rotl32 (b, 30u);               \
}
|
||||
|
||||
// Vector SHA1 step without the round constant:
//   e += x + f (b, c, d) + rotl32 (a, 5);  b = rotl32 (b, 30).
// x and the round-function result are added with one __add3 call.
#define SHA1_STEPX(f,a,b,c,d,e,x)     \
{                                     \
  const u32x t = f (b, c, d);         \
  e = __add3 (e, x, t);               \
  e += rotl32 (a,  5u);               \
  b  = rotl32 (b, 30u);               \
}
|
||||
@ -218,26 +218,28 @@
|
||||
|
||||
// Scalar SHA256 step.
//   h += K + x + SHA256_S3_S (e) + F1 (e, f, g);
//   d += h;
//   h += SHA256_S2_S (a) + F0 (a, b, c);
// Each pair of addends is folded in with one three-operand __add3_S call.
// d must be updated between the two halves, so the statement order matters.
#define SHA256_STEP_S(F0,F1,a,b,c,d,e,f,g,h,x,K)  \
{                                                 \
  const u32 t1 = SHA256_S3_S (e);                 \
  const u32 t2 = F1 (e,f,g);                      \
  h = __add3_S (h, K, x);                         \
  h = __add3_S (h, t1, t2);                       \
  d += h;                                         \
  const u32 t3 = SHA256_S2_S (a);                 \
  const u32 t4 = F0 (a,b,c);                      \
  h = __add3_S (h, t3, t4);                       \
}
|
||||
|
||||
// Scalar SHA256 message expansion: SHA256_S1_S (x) + y + SHA256_S0_S (z) + w.
// y and w are parenthesized so that argument expressions containing
// lower-precedence operators (|, ^, ?:, ...) are summed as a whole.
#define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + (y) + SHA256_S0_S (z) + (w))
|
||||
|
||||
// Vector SHA256 step.
//   h += K + x + SHA256_S3 (e) + F1 (e, f, g);
//   d += h;
//   h += SHA256_S2 (a) + F0 (a, b, c);
// Each pair of addends is folded in with one three-operand __add3 call.
// The temporaries are u32x, matching the operand type of __add3 and the
// other vector step macros, so no vector value is stored into a scalar.
// d must be updated between the two halves, so the statement order matters.
#define SHA256_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K)    \
{                                                 \
  const u32x t1 = SHA256_S3 (e);                  \
  const u32x t2 = F1 (e,f,g);                     \
  h = __add3 (h, K, x);                           \
  h = __add3 (h, t1, t2);                         \
  d += h;                                         \
  const u32x t3 = SHA256_S2 (a);                  \
  const u32x t4 = F0 (a,b,c);                     \
  h = __add3 (h, t3, t4);                         \
}
|
||||
|
||||
// Vector SHA256 message expansion: SHA256_S1 (x) + y + SHA256_S0 (z) + w.
// y and w are parenthesized so that argument expressions containing
// lower-precedence operators (|, ^, ?:, ...) are summed as a whole.
#define SHA256_EXPAND(x,y,z,w) (SHA256_S1 (x) + (y) + SHA256_S0 (z) + (w))
|
||||
|
@ -339,6 +339,82 @@ static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IS_AMD_ROCM_VEGA
|
||||
static u32x __add3 (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s8) : "v"(b.s8), "v"(a.s8), "v"(c.s8));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s9) : "v"(b.s9), "v"(a.s9), "v"(c.s9));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sa) : "v"(b.sa), "v"(a.sa), "v"(c.sa));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sb) : "v"(b.sb), "v"(a.sb), "v"(c.sb));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sc) : "v"(b.sc), "v"(a.sc), "v"(c.sc));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sd) : "v"(b.sd), "v"(a.sd), "v"(c.sd));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.se) : "v"(b.se), "v"(a.se), "v"(c.se));
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sf) : "v"(b.sf), "v"(a.sf), "v"(c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
#else
|
||||
static u32x __add3 (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
return a + b + c;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IS_AMD_ROCM_VEGA
|
||||
static u32 __add3_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
__asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
|
||||
|
||||
return r;
|
||||
}
|
||||
#else
|
||||
static u32 __add3_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
return a + b + c;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef IS_NV
|
||||
@ -571,6 +647,17 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// Three-operand vector add for this vendor section: returns a + b + c.
// Provided so the hash step macros can call __add3 unconditionally.
static u32x __add3 (const u32x a, const u32x b, const u32x c)
{
  return a + b + c;
}
|
||||
|
||||
// Three-operand scalar add for this vendor section: returns a + b + c.
// Provided so the hash step macros can call __add3_S unconditionally.
static u32 __add3_S (const u32 a, const u32 b, const u32 c)
{
  return a + b + c;
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef IS_GENERIC
|
||||
@ -710,6 +797,16 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
|
||||
return (u32) (tmp);
|
||||
}
|
||||
|
||||
// Three-operand vector add for this vendor section: returns a + b + c.
// Provided so the hash step macros can call __add3 unconditionally.
static u32x __add3 (const u32x a, const u32x b, const u32x c)
{
  return a + b + c;
}
|
||||
|
||||
// Three-operand scalar add for this vendor section: returns a + b + c.
// Provided so the hash step macros can call __add3_S unconditionally.
static u32 __add3_S (const u32 a, const u32 b, const u32 c)
{
  return a + b + c;
}
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct digest
|
||||
|
@ -36,6 +36,9 @@
|
||||
#else
|
||||
#define IS_AMD
|
||||
#define IS_AMD_ROCM
|
||||
// Define IS_AMD_ROCM_VEGA when the compiler predefines one of the
// __gfx900__ .. __gfx903__ target macros; code elsewhere keys the
// V_ADD3_U32 inline-asm paths off this symbol.
#if defined (__gfx900__) || defined (__gfx901__) || defined (__gfx902__) || defined (__gfx903__)
#define IS_AMD_ROCM_VEGA
#endif
|
||||
#endif
|
||||
#elif VENDOR_ID == (1 << 1)
|
||||
#define IS_APPLE
|
||||
|
Loading…
Reference in New Issue
Block a user