Optimize BLAKE2B_ROUND() 64 bit rotates

2025-02-18 18:42:19 +00:00 · 2021-07-17 18:18:22 +02:00 · 2021-07-17 18:18:22 +02:00 · bd92589af1
commit bd92589af1
parent 3becb253d3
2 changed files with 216 additions and 6 deletions
--- a/OpenCL/inc_hash_blake2b.cl
+++ b/OpenCL/inc_hash_blake2b.cl
@@ -9,6 +9,207 @@
 #include "inc_common.h"
 #include "inc_hash_blake2b.h"

+// Scalar helper for the 16 bit step of BLAKE2B_G: result is meant to equal hc_rotr64_S (a, 16).
+// NV and AMD/HIP (with HAS_VPERM) build each 32 bit half of the result with one
+// hc_byte_perm_S () call; every other target falls back to the generic rotate.
+DECLSPEC u64 blake2b_rot16_S (const u64 a)
+{
+  #if defined IS_NV
+
+  vconv64_t src;
+  vconv64_t dst;
+
+  src.v64 = a;
+
+  dst.v32.a = hc_byte_perm_S (src.v32.b, src.v32.a, 0x1076);
+  dst.v32.b = hc_byte_perm_S (src.v32.b, src.v32.a, 0x5432);
+
+  return dst.v64;
+
+  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
+
+  vconv64_t src;
+  vconv64_t dst;
+
+  src.v64 = a;
+
+  // selectors list the same source byte indices as the NV branch,
+  // written with one byte per index instead of one nibble
+  dst.v32.a = hc_byte_perm_S (src.v32.b, src.v32.a, 0x01000706);
+  dst.v32.b = hc_byte_perm_S (src.v32.b, src.v32.a, 0x05040302);
+
+  return dst.v64;
+
+  #else
+
+  return hc_rotr64_S (a, 16);
+
+  #endif
+}
+
+// Vector counterpart of blake2b_rot16_S (): the scalar helper is applied to
+// each component of `a` that exists for the configured VECT_SIZE.
+DECLSPEC u64x blake2b_rot16 (const u64x a)
+{
+  u64x out;
+
+  #if VECT_SIZE == 1
+  out = blake2b_rot16_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  out.s0 = blake2b_rot16_S (a.s0);
+  out.s1 = blake2b_rot16_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  out.s2 = blake2b_rot16_S (a.s2);
+  out.s3 = blake2b_rot16_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  out.s4 = blake2b_rot16_S (a.s4);
+  out.s5 = blake2b_rot16_S (a.s5);
+  out.s6 = blake2b_rot16_S (a.s6);
+  out.s7 = blake2b_rot16_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  out.s8 = blake2b_rot16_S (a.s8);
+  out.s9 = blake2b_rot16_S (a.s9);
+  out.sa = blake2b_rot16_S (a.sa);
+  out.sb = blake2b_rot16_S (a.sb);
+  out.sc = blake2b_rot16_S (a.sc);
+  out.sd = blake2b_rot16_S (a.sd);
+  out.se = blake2b_rot16_S (a.se);
+  out.sf = blake2b_rot16_S (a.sf);
+  #endif
+
+  return out;
+}
+
+// Scalar helper for the 24 bit step of BLAKE2B_G: result must equal hc_rotr64_S (a, 24).
+// NV and AMD/HIP (with HAS_VPERM) build each 32 bit half of the result with one
+// hc_byte_perm_S () call; every other target falls back to the generic rotate.
+DECLSPEC u64 blake2b_rot24_S (const u64 a)
+{
+  #if defined IS_NV
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x2107);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x6543);
+
+  return out.v64;
+
+  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  // selectors list the same source byte indices as the NV branch,
+  // written with one byte per index instead of one nibble
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x02010007);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x06050403);
+
+  return out.v64;
+
+  #else
+
+  // the rotation amount has to be 24 here: this path replaces
+  // hc_rotr64_S (b ^ c, 24) in BLAKE2B_G and must agree with the
+  // byte_perm branches above
+  return hc_rotr64_S (a, 24);
+
+  #endif
+}
+
+// Vector counterpart of blake2b_rot24_S (): the scalar helper is applied to
+// each component of `a` that exists for the configured VECT_SIZE.
+DECLSPEC u64x blake2b_rot24 (const u64x a)
+{
+  u64x out;
+
+  #if VECT_SIZE == 1
+  out = blake2b_rot24_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  out.s0 = blake2b_rot24_S (a.s0);
+  out.s1 = blake2b_rot24_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  out.s2 = blake2b_rot24_S (a.s2);
+  out.s3 = blake2b_rot24_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  out.s4 = blake2b_rot24_S (a.s4);
+  out.s5 = blake2b_rot24_S (a.s5);
+  out.s6 = blake2b_rot24_S (a.s6);
+  out.s7 = blake2b_rot24_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  out.s8 = blake2b_rot24_S (a.s8);
+  out.s9 = blake2b_rot24_S (a.s9);
+  out.sa = blake2b_rot24_S (a.sa);
+  out.sb = blake2b_rot24_S (a.sb);
+  out.sc = blake2b_rot24_S (a.sc);
+  out.sd = blake2b_rot24_S (a.sd);
+  out.se = blake2b_rot24_S (a.se);
+  out.sf = blake2b_rot24_S (a.sf);
+  #endif
+
+  return out;
+}
+
+// Scalar helper for the 32 bit step of BLAKE2B_G.
+// Exchanges the two 32 bit halves of `a` through the vconv64_t view,
+// so no shift or rotate instruction is needed on any target.
+DECLSPEC u64 blake2b_rot32_S (const u64 a)
+{
+  vconv64_t src;
+  vconv64_t dst;
+
+  src.v64 = a;
+
+  // swap the halves
+  dst.v32.b = src.v32.a;
+  dst.v32.a = src.v32.b;
+
+  return dst.v64;
+}
+
+// Vector counterpart of blake2b_rot32_S (): the scalar helper is applied to
+// each component of `a` that exists for the configured VECT_SIZE.
+DECLSPEC u64x blake2b_rot32 (const u64x a)
+{
+  u64x out;
+
+  #if VECT_SIZE == 1
+  out = blake2b_rot32_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  out.s0 = blake2b_rot32_S (a.s0);
+  out.s1 = blake2b_rot32_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  out.s2 = blake2b_rot32_S (a.s2);
+  out.s3 = blake2b_rot32_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  out.s4 = blake2b_rot32_S (a.s4);
+  out.s5 = blake2b_rot32_S (a.s5);
+  out.s6 = blake2b_rot32_S (a.s6);
+  out.s7 = blake2b_rot32_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  out.s8 = blake2b_rot32_S (a.s8);
+  out.s9 = blake2b_rot32_S (a.s9);
+  out.sa = blake2b_rot32_S (a.sa);
+  out.sb = blake2b_rot32_S (a.sb);
+  out.sc = blake2b_rot32_S (a.sc);
+  out.sd = blake2b_rot32_S (a.sd);
+  out.se = blake2b_rot32_S (a.se);
+  out.sf = blake2b_rot32_S (a.sf);
+  #endif
+
+  return out;
+}
+
 DECLSPEC void blake2b_transform (u64 *h, const u64 *m, const int len, const u64 f0)
 {
  const u64 t0 = hl32_to_64_S (0, len);
--- a/OpenCL/inc_hash_blake2b.h
+++ b/OpenCL/inc_hash_blake2b.h
@@ -9,14 +9,23 @@
 #define BLAKE2B_UPDATE  0
 #define BLAKE2B_FINAL  -1

+// 64 bit rotate helpers for the 16, 24 and 32 bit steps of the G function:
+// the _S variants take a scalar u64 (used by BLAKE2B_G), the unsuffixed
+// variants take a u64x vector (used by BLAKE2B_G_VECTOR)
+DECLSPEC u64  blake2b_rot16_S (const u64  a);
+DECLSPEC u64x blake2b_rot16   (const u64x a);
+
+DECLSPEC u64  blake2b_rot24_S (const u64  a);
+DECLSPEC u64x blake2b_rot24   (const u64x a);
+
+DECLSPEC u64  blake2b_rot32_S (const u64  a);
+DECLSPEC u64x blake2b_rot32   (const u64x a);
+
+// BLAKE2B_G: one G mixing step on the scalar words a, b, c and d (updated in place).
+// Reads m[k0] and m[k1] from an array `m` that must be visible where the macro expands.
+// The 32, 24 and 16 bit rotates go through the blake2b_rot*_S helpers, the 63 bit
+// rotate stays on hc_rotr64_S ().
 #define BLAKE2B_G(k0,k1,a,b,c,d) \
 {                                \
  a = a + b + m[k0];             \
-  d = hc_rotr64_S (d ^ a, 32);   \
+  d = blake2b_rot32_S (d ^ a);   \
  c = c + d;                     \
-  b = hc_rotr64_S (b ^ c, 24);   \
+  b = blake2b_rot24_S (b ^ c);   \
  a = a + b + m[k1];             \
-  d = hc_rotr64_S (d ^ a, 16);   \
+  d = blake2b_rot16_S (d ^ a);   \
  c = c + d;                     \
  b = hc_rotr64_S (b ^ c, 63);   \
 }
@@ -36,11 +45,11 @@
+// BLAKE2B_G_VECTOR: same statement sequence as BLAKE2B_G, but the rotates use the
+// vector helpers (blake2b_rot32 / blake2b_rot24 / blake2b_rot16 and hc_rotr64).
+// Reads m[k0] and m[k1] from an array `m` that must be visible where the macro expands.
 #define BLAKE2B_G_VECTOR(k0,k1,a,b,c,d) \
 {                                       \
  a = a + b + m[k0];                    \
-  d = hc_rotr64 (d ^ a, 32);            \
+  d = blake2b_rot32 (d ^ a);            \
  c = c + d;                            \
-  b = hc_rotr64 (b ^ c, 24);            \
+  b = blake2b_rot24 (b ^ c);            \
  a = a + b + m[k1];                    \
-  d = hc_rotr64 (d ^ a, 16);            \
+  d = blake2b_rot16 (d ^ a);            \
  c = c + d;                            \
  b = hc_rotr64 (b ^ c, 63);            \
 }