From bd92589af1633565bcac89293f7265cd96df0f90 Mon Sep 17 00:00:00 2001
From: Jens Steube
Date: Sat, 17 Jul 2021 18:18:22 +0200
Subject: [PATCH] Optimize BLAKE2B_ROUND() 64 bit rotates

---
 OpenCL/inc_hash_blake2b.cl | 216 +++++++++++++++++++++++++++++++++++++
 OpenCL/inc_hash_blake2b.h  |  21 ++--
 2 files changed, 231 insertions(+), 6 deletions(-)

diff --git a/OpenCL/inc_hash_blake2b.cl b/OpenCL/inc_hash_blake2b.cl
index a90ee9e03..33467c933 100644
--- a/OpenCL/inc_hash_blake2b.cl
+++ b/OpenCL/inc_hash_blake2b.cl
@@ -9,6 +9,222 @@
 #include "inc_common.h"
 #include "inc_hash_blake2b.h"
 
+// Scalar replacement for hc_rotr64_S (a, 16) as used by BLAKE2B_G.
+// NV and AMD/HIP (with HAS_VPERM) fill out.v32.a and out.v32.b through
+// hc_byte_perm_S (); every other platform falls back to hc_rotr64_S ().
+DECLSPEC u64 blake2b_rot16_S (const u64 a)
+{
+  #if defined IS_NV
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x1076);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x5432);
+
+  return out.v64;
+
+  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x01000706);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x05040302);
+
+  return out.v64;
+
+  #else
+
+  return hc_rotr64_S (a, 16);
+
+  #endif
+}
+
+// Vector variant used by BLAKE2B_G_VECTOR: applies blake2b_rot16_S () to each
+// component of a; the set of components handled is selected by VECT_SIZE.
+DECLSPEC u64x blake2b_rot16 (const u64x a)
+{
+  u64x r;
+
+  #if VECT_SIZE == 1
+  r = blake2b_rot16_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  r.s0 = blake2b_rot16_S (a.s0);
+  r.s1 = blake2b_rot16_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  r.s2 = blake2b_rot16_S (a.s2);
+  r.s3 = blake2b_rot16_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  r.s4 = blake2b_rot16_S (a.s4);
+  r.s5 = blake2b_rot16_S (a.s5);
+  r.s6 = blake2b_rot16_S (a.s6);
+  r.s7 = blake2b_rot16_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  r.s8 = blake2b_rot16_S (a.s8);
+  r.s9 = blake2b_rot16_S (a.s9);
+  r.sa = blake2b_rot16_S (a.sa);
+  r.sb = blake2b_rot16_S (a.sb);
+  r.sc = blake2b_rot16_S (a.sc);
+  r.sd = blake2b_rot16_S (a.sd);
+  r.se = blake2b_rot16_S (a.se);
+  r.sf = blake2b_rot16_S (a.sf);
+  #endif
+
+  return r;
+}
+
+// Scalar replacement for hc_rotr64_S (a, 24) as used by BLAKE2B_G.
+// NV and AMD/HIP (with HAS_VPERM) fill out.v32.a and out.v32.b through
+// hc_byte_perm_S (); every other platform falls back to hc_rotr64_S ().
+DECLSPEC u64 blake2b_rot24_S (const u64 a)
+{
+  #if defined IS_NV
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x2107);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x6543);
+
+  return out.v64;
+
+  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM
+
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x02010007);
+  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x06050403);
+
+  return out.v64;
+
+  #else
+
+  // BLAKE2B_G rotated (b ^ c) by 24 before this helper replaced it: keep 24 here
+  return hc_rotr64_S (a, 24);
+
+  #endif
+}
+
+// Vector variant used by BLAKE2B_G_VECTOR: applies blake2b_rot24_S () to each
+// component of a; the set of components handled is selected by VECT_SIZE.
+DECLSPEC u64x blake2b_rot24 (const u64x a)
+{
+  u64x r;
+
+  #if VECT_SIZE == 1
+  r = blake2b_rot24_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  r.s0 = blake2b_rot24_S (a.s0);
+  r.s1 = blake2b_rot24_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  r.s2 = blake2b_rot24_S (a.s2);
+  r.s3 = blake2b_rot24_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  r.s4 = blake2b_rot24_S (a.s4);
+  r.s5 = blake2b_rot24_S (a.s5);
+  r.s6 = blake2b_rot24_S (a.s6);
+  r.s7 = blake2b_rot24_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  r.s8 = blake2b_rot24_S (a.s8);
+  r.s9 = blake2b_rot24_S (a.s9);
+  r.sa = blake2b_rot24_S (a.sa);
+  r.sb = blake2b_rot24_S (a.sb);
+  r.sc = blake2b_rot24_S (a.sc);
+  r.sd = blake2b_rot24_S (a.sd);
+  r.se = blake2b_rot24_S (a.se);
+  r.sf = blake2b_rot24_S (a.sf);
+  #endif
+
+  return r;
+}
+
+// Scalar replacement for hc_rotr64_S (a, 32) as used by BLAKE2B_G: exchanges
+// the v32.a and v32.b members of the vconv64_t view of a on every platform.
+DECLSPEC u64 blake2b_rot32_S (const u64 a)
+{
+  vconv64_t in;
+
+  in.v64 = a;
+
+  vconv64_t out;
+
+  out.v32.a = in.v32.b;
+  out.v32.b = in.v32.a;
+
+  return out.v64;
+}
+
+// Vector variant used by BLAKE2B_G_VECTOR: applies blake2b_rot32_S () to each
+// component of a; the set of components handled is selected by VECT_SIZE.
+DECLSPEC u64x blake2b_rot32 (const u64x a)
+{
+  u64x r;
+
+  #if VECT_SIZE == 1
+  r = blake2b_rot32_S (a);
+  #endif
+
+  #if VECT_SIZE >= 2
+  r.s0 = blake2b_rot32_S (a.s0);
+  r.s1 = blake2b_rot32_S (a.s1);
+  #endif
+
+  #if VECT_SIZE >= 4
+  r.s2 = blake2b_rot32_S (a.s2);
+  r.s3 = blake2b_rot32_S (a.s3);
+  #endif
+
+  #if VECT_SIZE >= 8
+  r.s4 = blake2b_rot32_S (a.s4);
+  r.s5 = blake2b_rot32_S (a.s5);
+  r.s6 = blake2b_rot32_S (a.s6);
+  r.s7 = blake2b_rot32_S (a.s7);
+  #endif
+
+  #if VECT_SIZE >= 16
+  r.s8 = blake2b_rot32_S (a.s8);
+  r.s9 = blake2b_rot32_S (a.s9);
+  r.sa = blake2b_rot32_S (a.sa);
+  r.sb = blake2b_rot32_S (a.sb);
+  r.sc = blake2b_rot32_S (a.sc);
+  r.sd = blake2b_rot32_S (a.sd);
+  r.se = blake2b_rot32_S (a.se);
+  r.sf = blake2b_rot32_S (a.sf);
+  #endif
+
+  return r;
+}
+
 DECLSPEC void blake2b_transform (u64 *h, const u64 *m, const int len, const u64 f0)
 {
   const u64 t0 = hl32_to_64_S (0, len);
diff --git a/OpenCL/inc_hash_blake2b.h b/OpenCL/inc_hash_blake2b.h
index afcacf368..4195e28b4 100644
--- a/OpenCL/inc_hash_blake2b.h
+++ b/OpenCL/inc_hash_blake2b.h
@@ -9,14 +9,23 @@
 #define BLAKE2B_UPDATE 0
 #define BLAKE2B_FINAL -1
 
+DECLSPEC u64 blake2b_rot16_S (const u64 a);
+DECLSPEC u64x blake2b_rot16 (const u64x a);
+
+DECLSPEC u64 blake2b_rot24_S (const u64 a);
+DECLSPEC u64x blake2b_rot24 (const u64x a);
+
+DECLSPEC u64 blake2b_rot32_S (const u64 a);
+DECLSPEC u64x blake2b_rot32 (const u64x a);
+
 #define BLAKE2B_G(k0,k1,a,b,c,d) \
 { \
   a = a + b + m[k0]; \
-  d = hc_rotr64_S (d ^ a, 32); \
+  d = blake2b_rot32_S (d ^ a); \
   c = c + d; \
-  b = hc_rotr64_S (b ^ c, 24); \
+  b = blake2b_rot24_S (b ^ c); \
   a = a + b + m[k1]; \
-  d = hc_rotr64_S (d ^ a, 16); \
+  d = blake2b_rot16_S (d ^ a); \
   c = c + d; \
   b = hc_rotr64_S (b ^ c, 63); \
 }
@@ -36,11 +45,11 @@
 #define BLAKE2B_G_VECTOR(k0,k1,a,b,c,d) \
 { \
   a = a + b + m[k0]; \
-  d = hc_rotr64 (d ^ a, 32); \
+  d = blake2b_rot32 (d ^ a); \
   c = c + d; \
-  b = hc_rotr64 (b ^ c, 24); \
+  b = blake2b_rot24 (b ^ c); \
   a = a + b + m[k1]; \
-  d = hc_rotr64 (d ^ a, 16); \
+  d = blake2b_rot16 (d ^ a); \
   c = c + d; \
   b = hc_rotr64 (b ^ c, 63); \
 }