Add VEGA specific inline assembly to improve all MD4, MD5, SHA1 and SHA256 based kernels

pull/1373/head
jsteube 7 years ago
parent 7cebc936fd
commit 00e38cc2c6

@@ -34,23 +34,23 @@
 #define MD4_STEP_S(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32 t = f (b, c, d); \
+  a = __add3_S (a, x, t); \
   a = rotl32_S (a, s); \
 }
 #define MD4_STEP(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, x, t); \
   a = rotl32 (a, s); \
 }
 #define MD4_STEP0(f,a,b,c,d,K,s) \
 { \
-  a += K; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, K, t); \
   a = rotl32 (a, s); \
 }
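Reviewer note, not part of the commit: the rewritten MD4 steps above (and the analogous MD5/SHA1 steps below) only regroup the same 32-bit additions so that two of them can later be fused into one three-operand add; modular addition is associative, so the digest is unchanged. A minimal host-side C sketch of that equivalence, with add3, rotl32 and md4_f as stand-ins for the kernel's __add3_S, rotl32_S and MD4_F:

#include <stdint.h>
#include <stdio.h>

/* stand-ins for the kernel's __add3_S / rotl32_S / MD4_F helpers */
static uint32_t add3   (uint32_t a, uint32_t b, uint32_t c) { return a + b + c; }
static uint32_t rotl32 (uint32_t x, uint32_t s) { return (x << s) | (x >> (32 - s)); }
static uint32_t md4_f  (uint32_t x, uint32_t y, uint32_t z) { return z ^ (x & (y ^ z)); }

int main (void)
{
  const uint32_t b = 0xefcdab89, c = 0x98badcfe, d = 0x10325476;
  const uint32_t x = 0xdeadbeef, K = 0, s = 3;

  /* old ordering: a += K; a += x; a += f (b, c, d); a = rotl32 (a, s); */
  uint32_t a1 = 0x67452301;
  a1 += K; a1 += x; a1 += md4_f (b, c, d); a1 = rotl32 (a1, s);

  /* new ordering: a += K; t = f (b, c, d); a = __add3 (a, x, t); a = rotl32 (a, s); */
  uint32_t a2 = 0x67452301;
  a2 += K;
  const uint32_t t = md4_f (b, c, d);
  a2 = add3 (a2, x, t);
  a2 = rotl32 (a2, s);

  printf ("%08x %08x -> %s\n", (unsigned) a1, (unsigned) a2, (a1 == a2) ? "match" : "MISMATCH");
  return 0;
}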
@@ -72,7 +72,7 @@
 #define MD5_F(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define MD5_G(x,y,z) ((y) ^ ((z) & ((x) ^ (y))))
 #define MD5_H(x,y,z) ((x) ^ (y) ^ (z))
-#define MD5_I(x,y,z) (bitselect (0xffffffffU, (x), (z)) ^ (y))
+#define MD5_I(x,y,z) ((y) ^ ((x) | ~(z)))
 #define MD5_Fo(x,y,z) (bitselect ((z), (y), (x)))
 #define MD5_Go(x,y,z) (bitselect ((y), (x), (z)))
 #endif
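Reviewer note, not part of the commit: the two MD5_I forms in this hunk are equivalent. Per the OpenCL definition bitselect (a, b, c) = (a & ~c) | (b & c), so bitselect (0xffffffffU, x, z) = ~z | (x & z) = x | ~z, and XOR-ing with y yields the textbook I (x, y, z) = y ^ (x | ~z). Because the identity is bitwise, an exhaustive 8-bit check proves it for every width; a small C verification, with bitselect32 as a stand-in for the built-in:

#include <stdio.h>

/* bitselect per the OpenCL spec: result bit comes from b where c is 1, from a where c is 0 */
static unsigned bitselect32 (unsigned a, unsigned b, unsigned c)
{
  return (a & ~c) | (b & c);
}

int main (void)
{
  for (unsigned x = 0; x < 256; x++)
    for (unsigned y = 0; y < 256; y++)
      for (unsigned z = 0; z < 256; z++)
      {
        const unsigned lhs = (bitselect32 (0xffu, x, z) ^ y) & 0xffu; /* bitselect form  */
        const unsigned rhs = (y ^ (x | (~z & 0xffu))) & 0xffu;        /* classic MD5 I() */

        if (lhs != rhs) { printf ("mismatch at %02x %02x %02x\n", x, y, z); return 1; }
      }

  puts ("MD5_I forms are identical");
  return 0;
}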
@@ -89,8 +89,8 @@
 #define MD5_STEP_S(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32 t = f (b, c, d); \
+  a = __add3_S (a, x, t); \
   a = rotl32_S (a, s); \
   a += b; \
 }
@@ -98,16 +98,16 @@
 #define MD5_STEP(f,a,b,c,d,x,K,s) \
 { \
   a += K; \
-  a += x; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, x, t); \
   a = rotl32 (a, s); \
   a += b; \
 }
 #define MD5_STEP0(f,a,b,c,d,K,s) \
 { \
-  a += K; \
-  a += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  a = __add3 (a, K, t); \
   a = rotl32 (a, s); \
   a += b; \
 }
@@ -139,8 +139,8 @@
 #define SHA1_STEP_S(f,a,b,c,d,e,x) \
 { \
   e += K; \
-  e += x; \
-  e += f (b, c, d); \
+  const u32 t = f (b, c, d); \
+  e = __add3_S (e, x, t); \
   e += rotl32_S (a, 5u); \
   b = rotl32_S (b, 30u); \
 }
@@ -148,24 +148,24 @@
 #define SHA1_STEP(f,a,b,c,d,e,x) \
 { \
   e += K; \
-  e += x; \
-  e += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  e = __add3 (e, x, t); \
   e += rotl32 (a, 5u); \
   b = rotl32 (b, 30u); \
 }
 #define SHA1_STEP0(f,a,b,c,d,e,x) \
 { \
-  e += K; \
-  e += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  e = __add3 (e, K, t); \
   e += rotl32 (a, 5u); \
   b = rotl32 (b, 30u); \
 }
 #define SHA1_STEPX(f,a,b,c,d,e,x) \
 { \
-  e += x; \
-  e += f (b, c, d); \
+  const u32x t = f (b, c, d); \
+  e = __add3 (e, x, t); \
   e += rotl32 (a, 5u); \
   b = rotl32 (b, 30u); \
 }
@@ -218,26 +218,28 @@
 #define SHA256_STEP_S(F0,F1,a,b,c,d,e,f,g,h,x,K) \
 { \
-  h += K; \
-  h += x; \
-  h += SHA256_S3_S (e); \
-  h += F1 (e,f,g); \
+  const u32 t1 = SHA256_S3_S (e); \
+  const u32 t2 = F1 (e,f,g); \
+  h = __add3_S (h, K, x); \
+  h = __add3_S (h, t1, t2); \
   d += h; \
-  h += SHA256_S2_S (a); \
-  h += F0 (a,b,c); \
+  const u32 t3 = SHA256_S2_S (a); \
+  const u32 t4 = F0 (a,b,c); \
+  h = __add3_S (h, t3, t4); \
 }
 #define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + y + SHA256_S0_S (z) + w)
 #define SHA256_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
 { \
-  h += K; \
-  h += x; \
-  h += SHA256_S3 (e); \
-  h += F1 (e,f,g); \
+  const u32x t1 = SHA256_S3 (e); \
+  const u32x t2 = F1 (e,f,g); \
+  h = __add3 (h, K, x); \
+  h = __add3 (h, t1, t2); \
   d += h; \
-  h += SHA256_S2 (a); \
-  h += F0 (a,b,c); \
+  const u32x t3 = SHA256_S2 (a); \
+  const u32x t4 = F0 (a,b,c); \
+  h = __add3 (h, t3, t4); \
 }
 #define SHA256_EXPAND(x,y,z,w) (SHA256_S1 (x) + y + SHA256_S0 (z) + w)
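Reviewer note, not part of the commit: the SHA-256 step rewrite groups its five additions into three-operand chunks (h+K+x, then h+t1+t2, then h+t3+t4) so that each chunk can map onto one V_ADD3_U32, while h and d keep their original values. A host-side C sketch comparing the two orderings for a single step; S2, S3, F0, F1 and add3 are stand-ins for the kernel's helpers, using the standard SHA-256 definitions:

#include <stdint.h>
#include <stdio.h>

static uint32_t rotr32 (uint32_t x, uint32_t n) { return (x >> n) | (x << (32 - n)); }

/* stand-ins for the kernel's helpers (standard SHA-256 definitions) */
static uint32_t S2 (uint32_t x) { return rotr32 (x, 2) ^ rotr32 (x, 13) ^ rotr32 (x, 22); } /* Sigma0 */
static uint32_t S3 (uint32_t x) { return rotr32 (x, 6) ^ rotr32 (x, 11) ^ rotr32 (x, 25); } /* Sigma1 */
static uint32_t F0 (uint32_t a, uint32_t b, uint32_t c) { return (a & b) | (c & (a | b)); } /* Maj */
static uint32_t F1 (uint32_t e, uint32_t f, uint32_t g) { return g ^ (e & (f ^ g)); }       /* Ch  */
static uint32_t add3 (uint32_t a, uint32_t b, uint32_t c) { return a + b + c; }

int main (void)
{
  const uint32_t a = 0x6a09e667, b = 0xbb67ae85, c = 0x3c6ef372, d = 0xa54ff53a;
  const uint32_t e = 0x510e527f, f = 0x9b05688c, g = 0x1f83d9ab, h = 0x5be0cd19;
  const uint32_t x = 0x61626380, K = 0x428a2f98;

  /* old ordering: five dependent adds */
  uint32_t h1 = h, d1 = d;
  h1 += K; h1 += x; h1 += S3 (e); h1 += F1 (e, f, g);
  d1 += h1;
  h1 += S2 (a); h1 += F0 (a, b, c);

  /* new ordering: the same sums grouped into three-operand adds */
  uint32_t h2 = h, d2 = d;
  h2 = add3 (h2, K, x);
  h2 = add3 (h2, S3 (e), F1 (e, f, g));
  d2 += h2;
  h2 = add3 (h2, S2 (a), F0 (a, b, c));

  printf ("h: %08x vs %08x, d: %08x vs %08x -> %s\n",
          (unsigned) h1, (unsigned) h2, (unsigned) d1, (unsigned) d2,
          (h1 == h2 && d1 == d2) ? "match" : "MISMATCH");
  return 0;
}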

@@ -339,6 +339,82 @@ static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
 }
 #endif
+#ifdef IS_AMD_ROCM_VEGA
+static u32x __add3 (const u32x a, const u32x b, const u32x c)
+{
+  u32x r;
+  #if VECT_SIZE == 1
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
+  #endif
+  #if VECT_SIZE >= 2
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  #endif
+  #if VECT_SIZE >= 4
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
+  #endif
+  #if VECT_SIZE >= 8
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
+  #endif
+  #if VECT_SIZE >= 16
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s8) : "v"(b.s8), "v"(a.s8), "v"(c.s8));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.s9) : "v"(b.s9), "v"(a.s9), "v"(c.s9));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sa) : "v"(b.sa), "v"(a.sa), "v"(c.sa));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sb) : "v"(b.sb), "v"(a.sb), "v"(c.sb));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sc) : "v"(b.sc), "v"(a.sc), "v"(c.sc));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sd) : "v"(b.sd), "v"(a.sd), "v"(c.sd));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.se) : "v"(b.se), "v"(a.se), "v"(c.se));
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r.sf) : "v"(b.sf), "v"(a.sf), "v"(c.sf));
+  #endif
+  return r;
+}
+#else
+static u32x __add3 (const u32x a, const u32x b, const u32x c)
+{
+  return a + b + c;
+}
+#endif
+#ifdef IS_AMD_ROCM_VEGA
+static u32 __add3_S (const u32 a, const u32 b, const u32 c)
+{
+  u32 r;
+  __asm__ volatile ("V_ADD3_U32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
+  return r;
+}
+#else
+static u32 __add3_S (const u32 a, const u32 b, const u32 c)
+{
+  return a + b + c;
+}
+#endif
 #endif
 #ifdef IS_NV
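Reviewer note, not part of the commit: V_ADD3_U32 is a GFX9 (Vega) instruction that adds three 32-bit operands in a single operation, and the "v" constraints keep the operands in vector registers; every other platform falls back to the plain a + b + c, which wraps modulo 2^32 just like the hardware add (and just as the hash algorithms require). A trivial host-side check of that wrap-around behaviour for the fallback expression:

#include <stdint.h>
#include <stdio.h>

/* the non-Vega fallback: u32 addition wraps modulo 2^32, matching the hardware add */
static uint32_t add3 (uint32_t a, uint32_t b, uint32_t c) { return a + b + c; }

int main (void)
{
  printf ("%08x\n", (unsigned) add3 (0xffffffffu, 0xffffffffu, 0xffffffffu)); /* fffffffd */
  printf ("%08x\n", (unsigned) add3 (0x80000000u, 0x80000000u, 0x00000001u)); /* 00000001 */
  return 0;
}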
@@ -571,6 +647,17 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
   return r;
 }
+static u32x __add3 (const u32x a, const u32x b, const u32x c)
+{
+  return a + b + c;
+}
+static u32 __add3_S (const u32 a, const u32 b, const u32 c)
+{
+  return a + b + c;
+}
 #endif
 #ifdef IS_GENERIC
@@ -710,6 +797,16 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
   return (u32) (tmp);
 }
+static u32x __add3 (const u32x a, const u32x b, const u32x c)
+{
+  return a + b + c;
+}
+static u32 __add3_S (const u32 a, const u32 b, const u32 c)
+{
+  return a + b + c;
+}
 #endif
 typedef struct digest

@@ -36,6 +36,9 @@
 #else
 #define IS_AMD
 #define IS_AMD_ROCM
+#if defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__
+#define IS_AMD_ROCM_VEGA
+#endif
 #endif
 #elif VENDOR_ID == (1 << 1)
 #define IS_APPLE
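Reviewer note, not part of the commit: the ROCm device compiler predefines a __gfxNNN__ macro for the compilation target, so gating on the gfx900 family is what limits the V_ADD3_U32 path to Vega while every other target keeps the portable code. A tiny illustrative program (TARGET_IS_VEGA is a made-up name) showing the same gate; compiled with a host compiler it takes the fallback branch:

#include <stdio.h>

/* mirrors the gate added above; on a host compiler no __gfxNNN__ macro is defined */
#if defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__
#define TARGET_IS_VEGA 1
#else
#define TARGET_IS_VEGA 0
#endif

int main (void)
{
  printf ("Vega target: %s\n", TARGET_IS_VEGA ? "yes (V_ADD3_U32 path)" : "no (portable a + b + c)");
  return 0;
}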
