More VEGA-specific inline assembly to improve SHA1-based kernels

pull/1373/head
jsteube 7 years ago
parent a0be36d7b8
commit 9de1e557bb

@@ -130,20 +130,18 @@
#define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z)))
#endif
-#define SHA1_STEP_S(f,a,b,c,d,e,x) \
-{ \
-  e += K; \
-  e = __add3_S (e, x, f (b, c, d)); \
-  e += rotl32_S (a, 5u); \
-  b = rotl32_S (b, 30u); \
+#define SHA1_STEP_S(f,a,b,c,d,e,x) \
+{ \
+  e = __add3_S (e, x, f (b, c, d)); \
+  e = __add3_S (e, K, rotl32_S (a, 5u)); \
+  b = rotl32_S (b, 30u); \
}
-#define SHA1_STEP(f,a,b,c,d,e,x) \
-{ \
-  e += K; \
-  e = __add3 (e, x, f (b, c, d)); \
-  e += rotl32 (a, 5u); \
-  b = rotl32 (b, 30u); \
+#define SHA1_STEP(f,a,b,c,d,e,x) \
+{ \
+  e = __add3 (e, x, f (b, c, d)); \
+  e = __add3 (e, K, rotl32 (a, 5u)); \
+  b = rotl32 (b, 30u); \
}
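The point of the regrouping: one SHA1 round sums five 32-bit terms, e + x + f (b, c, d) + K + rotl32 (a, 5u). The old macro spent two plain adds (for K and for the rotate result) around a single __add3, three add operations in all, while the new macro covers the same five terms with two __add3 calls, which the ROCm compiler can lower to two V_ADD3_U32 instructions on VEGA (GFX9). A plain-C sketch of the regrouped step, with add3 standing in for hashcat's __add3 helper (illustrative names, not kernel code):

#include <stdint.h>

// add3 models hashcat's __add3/__add3_S; on GFX9 a three-operand
// add can lower to a single V_ADD3_U32 instruction.
static inline uint32_t add3 (uint32_t a, uint32_t b, uint32_t c)
{
  return a + b + c;
}

static inline uint32_t rotl32 (uint32_t x, uint32_t n)
{
  return (x << n) | (x >> (32 - n));
}

// Old grouping: e += K; e = add3 (e, x, f(b,c,d)); e += rotl32 (a, 5u);  -> 3 add ops
// New grouping: the same five-term sum in 2 add ops:
#define SHA1_STEP_MODEL(f,a,b,c,d,e,x,K)  \
{                                         \
  e = add3 (e, x, f (b, c, d));           \
  e = add3 (e, K, rotl32 (a, 5u));        \
  b = rotl32 (b, 30u);                    \
}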
#define SHA1_STEP0(f,a,b,c,d,e,x) \
@@ -160,19 +158,6 @@
  b = rotl32 (b, 30u); \
}
-#define SHA1_STEP_PE(f,a,b,c,d,e,x) \
-{ \
-  e += x; \
-  e += f (b, c, d); \
-  e += rotl32 (a, 5u); \
-}
-#define SHA1_STEP_PB(f,a,b,c,d,e,x) \
-{ \
-  e += K; \
-  b = rotl32 (b, 30u); \
-}
#define SHIFT_RIGHT_32(x,n) ((x) >> (n))
#define SHA256_S0_S(x) (rotl32_S ((x), 25u) ^ rotl32_S ((x), 14u) ^ SHIFT_RIGHT_32 ((x), 3u))

@@ -176,16 +176,43 @@ static u64x hl32_to_64 (const u32x a, const u32x b)
#ifdef IS_AMD
static u32 swap32_S (const u32 v)
{
-  return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
+#ifdef IS_AMD_ROCM
+  u32 t;
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
+  return t;
+#else
+  return as_uint (as_uchar4 (v).s3210);
+#endif
}
static u64 swap64_S (const u64 v)
{
-  return bitselect (bitselect (rotate (v, 24ul),
-                               rotate (v,  8ul), 0x000000ff000000fful),
-                    bitselect (rotate (v, 56ul),
-                               rotate (v, 40ul), 0x00ff000000ff0000ul),
-                                                 0xffff0000ffff0000ul);
+#ifdef IS_AMD_ROCM
+  const u32 v0 = h32_from_64_S (v);
+  const u32 v1 = l32_from_64_S (v);
+  u32 t0;
+  u32 t1;
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t0) : "v"(v0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t1) : "v"(v1), "v"(0x00010203));
+  const u64 r = hl32_to_64_S (t1, t0);
+  return r;
+#else
+  return (as_ulong (as_uchar8 (v).s76543210));
+#endif
}
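How the V_PERM_B32 selector works: each byte of the last operand picks one byte from the {src0, src1} byte pool, and selector values 0..3 address the bytes of src1 (src0 is a constant 0 here and never selected). The selector 0x00010203 therefore emits src1's bytes in the order 3, 2, 1, 0: a one-instruction 32-bit byte swap. A host-side model of that mapping (plain C; perm_b32_model is an illustrative name, assuming the GFX9 semantics just described):

#include <stdint.h>
#include <stdio.h>

// Models V_PERM_B32 for selector byte values 0..3 (bytes of src1 only).
static uint32_t perm_b32_model (uint32_t src1, uint32_t sel)
{
  uint32_t r = 0;

  for (int i = 0; i < 4; i++)
  {
    const uint32_t k = (sel  >> (i * 8)) & 0xff; // selector byte i
    const uint32_t b = (src1 >> (k * 8)) & 0xff; // picks byte k of src1

    r |= b << (i * 8);
  }

  return r;
}

int main (void)
{
  printf ("%08x\n", perm_b32_model (0x11223344, 0x00010203)); // prints 44332211

  return 0;
}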
static u32 rotr32_S (const u32 a, const u32 n)
@@ -218,16 +245,122 @@ static u64 rotl64_S (const u64 a, const u32 n)
static u32x swap32 (const u32x v)
{
-  return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
+#ifdef IS_AMD_ROCM
+  u32x t;
+#if VECT_SIZE == 1
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 2
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s0) : "v"(v.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s1) : "v"(v.s1), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 4
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s2) : "v"(v.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s3) : "v"(v.s3), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 8
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s4) : "v"(v.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s5) : "v"(v.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s6) : "v"(v.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s7) : "v"(v.s7), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 16
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s8) : "v"(v.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s9) : "v"(v.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sa) : "v"(v.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sb) : "v"(v.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sc) : "v"(v.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sd) : "v"(v.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.se) : "v"(v.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sf) : "v"(v.sf), "v"(0x00010203));
+#endif
+  return t;
+#else
+  return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
+#endif
}
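Inline assembly operands are individual 32-bit VGPRs, so the vectorized path cannot hand a whole u32x to one asm statement: the #if VECT_SIZE ladder above unrolls one V_PERM_B32 per component. The same fan-out can be sketched with the per-lane asm factored into a macro (SWAP32_ASM_S and swap32_alt are hypothetical names, shown for VECT_SIZE == 4 only):

// Hypothetical per-lane wrapper around the single-u32 asm statement above.
#define SWAP32_ASM_S(dst,src) \
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(dst) : "v"(src), "v"(0x00010203))

#if VECT_SIZE == 4
static u32x swap32_alt (const u32x v)
{
  u32x t;

  SWAP32_ASM_S (t.s0, v.s0);
  SWAP32_ASM_S (t.s1, v.s1);
  SWAP32_ASM_S (t.s2, v.s2);
  SWAP32_ASM_S (t.s3, v.s3);

  return t;
}
#endif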
static u64x swap64 (const u64x v)
{
-  return bitselect (bitselect (rotate (v, 24ul),
-                               rotate (v,  8ul), 0x000000ff000000fful),
-                    bitselect (rotate (v, 56ul),
-                               rotate (v, 40ul), 0x00ff000000ff0000ul),
-                                                 0xffff0000ffff0000ul);
+#ifdef IS_AMD_ROCM
+  const u32x a0 = h32_from_64 (v);
+  const u32x a1 = l32_from_64 (v);
+  u32x t0;
+  u32x t1;
+#if VECT_SIZE == 1
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0) : "v"(0), "v"(a0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1) : "v"(0), "v"(a1), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 2
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s0) : "v"(0), "v"(a0.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s0) : "v"(0), "v"(a1.s0), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s1) : "v"(0), "v"(a0.s1), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s1) : "v"(0), "v"(a1.s1), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 4
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s2) : "v"(0), "v"(a0.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s2) : "v"(0), "v"(a1.s2), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s3) : "v"(0), "v"(a0.s3), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s3) : "v"(0), "v"(a1.s3), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 8
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s4) : "v"(0), "v"(a0.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s4) : "v"(0), "v"(a1.s4), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s5) : "v"(0), "v"(a0.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s5) : "v"(0), "v"(a1.s5), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s6) : "v"(0), "v"(a0.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s6) : "v"(0), "v"(a1.s6), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s7) : "v"(0), "v"(a0.s7), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s7) : "v"(0), "v"(a1.s7), "v"(0x00010203));
+#endif
+#if VECT_SIZE >= 16
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s8) : "v"(0), "v"(a0.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s8) : "v"(0), "v"(a1.s8), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s9) : "v"(0), "v"(a0.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s9) : "v"(0), "v"(a1.s9), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sa) : "v"(0), "v"(a0.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sa) : "v"(0), "v"(a1.sa), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sb) : "v"(0), "v"(a0.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sb) : "v"(0), "v"(a1.sb), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sc) : "v"(0), "v"(a0.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sc) : "v"(0), "v"(a1.sc), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sd) : "v"(0), "v"(a0.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sd) : "v"(0), "v"(a1.sd), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.se) : "v"(0), "v"(a0.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.se) : "v"(0), "v"(a1.se), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sf) : "v"(0), "v"(a0.sf), "v"(0x00010203));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sf) : "v"(0), "v"(a1.sf), "v"(0x00010203));
+#endif
+  const u64x r = hl32_to_64 (t1, t0);
+  return r;
+#else
+  return bitselect (bitselect (rotate (v, 24ul),
+                               rotate (v,  8ul), 0x000000ff000000fful),
+                    bitselect (rotate (v, 56ul),
+                               rotate (v, 40ul), 0x00ff000000ff0000ul),
+                                                 0xffff0000ffff0000ul);
+#endif
}
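The #else branch keeps the portable identity: bitselect (a, b, m) takes each bit from b where the mask bit is set and from a elsewhere, so masking bytes out of four rotations of v reassembles the value with its eight bytes reversed. A quick host-side check of the identity (plain C, illustrative only):

#include <stdint.h>
#include <stdio.h>

static uint64_t rotl64 (uint64_t x, unsigned n)             { return (x << n) | (x >> (64 - n)); }
static uint64_t bitsel (uint64_t a, uint64_t b, uint64_t m) { return (a & ~m) | (b & m); }

int main (void)
{
  const uint64_t v = 0x1122334455667788ull;

  const uint64_t r = bitsel (bitsel (rotl64 (v, 24), rotl64 (v,  8), 0x000000ff000000ffull),
                             bitsel (rotl64 (v, 56), rotl64 (v, 40), 0x00ff000000ff0000ull),
                             0xffff0000ffff0000ull);

  printf ("%016llx\n", (unsigned long long) r); // prints 8877665544332211

  return 0;
}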
static u32x rotr32 (const u32x a, const u32 n)

@@ -362,7 +362,7 @@ void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
* reverse
*/
-const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+const u32 e_rev = rotl32_S (search[1], 2u);
/**
* loop
@@ -499,13 +499,10 @@ void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
if (MATCHES_NONE_VS (e, e_rev)) continue;
-SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
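The e_rev change is the kernel-side half of the macro cleanup: SHA1_STEP_PE deliberately left K out of the last computed round so the early-reject compare could run one add sooner, and e_rev compensated by subtracting SHA1C03 (the round-4 constant 0xca62c1d6); SHA1_STEP_PB then added K back for the survivors. With the full SHA1_STEP restored, K is already folded into e before the MATCHES_NONE_VS test, so the target is plain rotl32_S (search[1], 2u) and the fix-up step disappears. Both forms reject exactly the same candidates; a scalar check of the wraparound algebra (plain C, illustrative):

#include <stdint.h>
#include <assert.h>

#define SHA1C03 0xca62c1d6u // SHA1 K for rounds 60..79

int main (void)
{
  // For any partial sum e_p and target t, in uint32 wraparound arithmetic:
  //   e_p == t - SHA1C03   <=>   e_p + SHA1C03 == t
  const uint32_t e_p = 0xdeadbeefu;
  const uint32_t t   = e_p + SHA1C03;    // construct a matching target

  assert (e_p           == t - SHA1C03); // old test: PE step, e_rev -= SHA1C03
  assert (e_p + SHA1C03 == t);           // new test: full step folds K in

  return 0;
}

The same substitution repeats verbatim in the m00110s and m00130s hunks below.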

@@ -410,7 +410,7 @@ void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
* reverse
*/
-const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+const u32 e_rev = rotl32_S (search[1], 2u);
/**
* loop
@@ -547,13 +547,10 @@ void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
if (MATCHES_NONE_VS (e, e_rev)) continue;
-SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);

@@ -410,7 +410,7 @@ void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
* reverse
*/
-const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03;
+const u32 e_rev = rotl32_S (search[1], 2u);
/**
* loop
@@ -547,13 +547,10 @@ void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global const ke
SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18));
SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20));
SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16));
-SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
+SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14));
if (MATCHES_NONE_VS (e, e_rev)) continue;
-SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0);
const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u);
const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u);
const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u);
