|
|
|
@@ -176,16 +176,43 @@ static u64x hl32_to_64 (const u32x a, const u32x b)
|
|
|
|
|
#ifdef IS_AMD
|
|
|
|
|
static u32 swap32_S (const u32 v)
|
|
|
|
|
{
|
|
|
|
|
return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
|
|
|
|
|
#ifdef IS_AMD_ROCM
|
|
|
|
|
|
|
|
|
|
u32 t;
|
|
|
|
|
|
|
|
|
|
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
|
|
|
|
|
|
|
|
|
|
return t;
|
|
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
|
|
return as_uint (as_uchar4 (v).s3210);
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static u64 swap64_S (const u64 v)
|
|
|
|
|
{
|
|
|
|
|
return bitselect (bitselect (rotate (v, 24ul),
|
|
|
|
|
rotate (v, 8ul), 0x000000ff000000fful),
|
|
|
|
|
bitselect (rotate (v, 56ul),
|
|
|
|
|
rotate (v, 40ul), 0x00ff000000ff0000ul),
|
|
|
|
|
0xffff0000ffff0000ul);
|
|
|
|
|
#ifdef IS_AMD_ROCM
|
|
|
|
|
|
|
|
|
|
const u32 v0 = h32_from_64_S (v);
|
|
|
|
|
const u32 v1 = l32_from_64_S (v);
|
|
|
|
|
|
|
|
|
|
u32 t0;
|
|
|
|
|
u32 t1;
|
|
|
|
|
|
|
|
|
|
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t0) : "v"(v0), "v"(0x00010203));
|
|
|
|
|
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t1) : "v"(v1), "v"(0x00010203));
|
|
|
|
|
|
|
|
|
|
const u64 r = hl32_to_64_S (t1, t0);
|
|
|
|
|
|
|
|
|
|
return r;
|
|
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
|
|
return (as_ulong (as_uchar8 (v).s76543210));
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static u32 rotr32_S (const u32 a, const u32 n)
|
|
|
|
@@ -218,16 +245,122 @@ static u64 rotl64_S (const u64 a, const u32 n)
|
|
|
|
|
|
|
|
|
|
// Byte-swap each 32-bit lane of a (possibly vector) u32x value.
//
// On ROCm, each lane is swapped with its own V_PERM_B32 instruction: vector
// OpenCL types have no direct asm operand mapping, so the lanes are addressed
// individually (.s0 .. .sf) and the emitted instructions are gated on the
// compile-time VECT_SIZE. NOTE(review): selector 0x00010203 is intended to
// reverse the byte order of one 32-bit register (it must match the bitselect
// fallback below) — confirm against the AMD GCN ISA manual.
static u32x swap32 (const u32x v)
{
  #ifdef IS_AMD_ROCM

  u32x t;

  // Scalar build: u32x is a plain u32, so the whole value is one operand.
  #if VECT_SIZE == 1
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t) : "v"(v), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 2
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s0) : "v"(v.s0), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s1) : "v"(v.s1), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 4
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s2) : "v"(v.s2), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s3) : "v"(v.s3), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 8
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s4) : "v"(v.s4), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s5) : "v"(v.s5), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s6) : "v"(v.s6), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s7) : "v"(v.s7), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 16
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s8) : "v"(v.s8), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.s9) : "v"(v.s9), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sa) : "v"(v.sa), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sb) : "v"(v.sb), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sc) : "v"(v.sc), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sd) : "v"(v.sd), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.se) : "v"(v.se), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(t.sf) : "v"(v.sf), "v"(0x00010203));
  #endif

  return t;

  #else

  // Generic path: rotate-and-merge byte swap, works lane-wise on vectors.
  return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);

  #endif
}
|
|
|
|
|
|
|
|
|
|
// Byte-swap each 64-bit lane of a (possibly vector) u64x value.
//
// On ROCm, each 64-bit lane is split into 32-bit halves (a0 = high, a1 = low),
// each half is byte-swapped with V_PERM_B32, and the halves are exchanged when
// recombining (swapped low half becomes the new high half). Vector OpenCL
// types have no direct asm operand mapping, so lanes are addressed
// individually (.s0 .. .sf), gated on the compile-time VECT_SIZE.
// NOTE(review): here the zero source is passed as an explicit "v"(0) operand
// ("%1"), unlike swap32 which encodes the literal 0 in the asm template; the
// two forms are presumably equivalent — confirm against the AMD GCN ISA.
static u64x swap64 (const u64x v)
{
  #ifdef IS_AMD_ROCM

  const u32x a0 = h32_from_64 (v);
  const u32x a1 = l32_from_64 (v);

  u32x t0;
  u32x t1;

  // Scalar build: u32x is a plain u32, whole halves are single operands.
  #if VECT_SIZE == 1
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0) : "v"(0), "v"(a0), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1) : "v"(0), "v"(a1), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 2
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s0) : "v"(0), "v"(a0.s0), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s0) : "v"(0), "v"(a1.s0), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s1) : "v"(0), "v"(a0.s1), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s1) : "v"(0), "v"(a1.s1), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 4
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s2) : "v"(0), "v"(a0.s2), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s2) : "v"(0), "v"(a1.s2), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s3) : "v"(0), "v"(a0.s3), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s3) : "v"(0), "v"(a1.s3), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 8
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s4) : "v"(0), "v"(a0.s4), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s4) : "v"(0), "v"(a1.s4), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s5) : "v"(0), "v"(a0.s5), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s5) : "v"(0), "v"(a1.s5), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s6) : "v"(0), "v"(a0.s6), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s6) : "v"(0), "v"(a1.s6), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s7) : "v"(0), "v"(a0.s7), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s7) : "v"(0), "v"(a1.s7), "v"(0x00010203));
  #endif

  #if VECT_SIZE >= 16
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s8) : "v"(0), "v"(a0.s8), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s8) : "v"(0), "v"(a1.s8), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.s9) : "v"(0), "v"(a0.s9), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.s9) : "v"(0), "v"(a1.s9), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sa) : "v"(0), "v"(a0.sa), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sa) : "v"(0), "v"(a1.sa), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sb) : "v"(0), "v"(a0.sb), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sb) : "v"(0), "v"(a1.sb), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sc) : "v"(0), "v"(a0.sc), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sc) : "v"(0), "v"(a1.sc), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sd) : "v"(0), "v"(a0.sd), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sd) : "v"(0), "v"(a1.sd), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.se) : "v"(0), "v"(a0.se), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.se) : "v"(0), "v"(a1.se), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t0.sf) : "v"(0), "v"(a0.sf), "v"(0x00010203));
  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(t1.sf) : "v"(0), "v"(a1.sf), "v"(0x00010203));
  #endif

  // Recombine with halves exchanged: t1 (swapped low) becomes the high half.
  const u64x r = hl32_to_64 (t1, t0);

  return r;

  #else

  // Generic path: rotate-and-merge 64-bit byte swap, works lane-wise.
  return bitselect (bitselect (rotate (v, 24ul),
                               rotate (v,  8ul), 0x000000ff000000fful),
                    bitselect (rotate (v, 56ul),
                               rotate (v, 40ul), 0x00ff000000ff0000ul),
                                                 0xffff0000ffff0000ul);

  #endif
}
|
|
|
|
|
|
|
|
|
|
static u32x rotr32 (const u32x a, const u32 n)
|
|
|
|
|