From 1c280e4a6e03697641511ab29385502def8b6b07 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Thu, 2 Aug 2018 14:20:04 +0200 Subject: [PATCH] Small performance boost for bcrypt on CPU --- OpenCL/m03200-pure.cl | 83 +++++++++++-------------------------------- 1 file changed, 20 insertions(+), 63 deletions(-) diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl index ee400efbc..62de27572 100644 --- a/OpenCL/m03200-pure.cl +++ b/OpenCL/m03200-pure.cl @@ -286,59 +286,17 @@ __constant u32a c_sbox3[256] = 0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6 }; -#ifdef IS_AMD -#define BF_ROUND(L,R,N) \ -{ \ - uchar4 c = as_uchar4 ((L)); \ - \ - u32 tmp; \ - \ - tmp = S0[c.s3]; \ - tmp += S1[c.s2]; \ - tmp ^= S2[c.s1]; \ - tmp += S3[c.s0]; \ - \ - (R) ^= tmp ^ P[(N)]; \ -} -#endif - -#ifdef IS_NV -#define BF_ROUND(L,R,N) \ -{ \ - u32 tmp; \ +#define BF_ROUND(L,R,N) \ +{ \ + u32 tmp; \ \ - tmp = S0[hc_bfe ((L), 24, 8)]; \ - tmp += S1[hc_bfe ((L), 16, 8)]; \ - tmp ^= S2[hc_bfe ((L), 8, 8)]; \ - tmp += S3[hc_bfe ((L), 0, 8)]; \ + tmp = S0[hc_bfe_S ((L), 24, 8)]; \ + tmp += S1[hc_bfe_S ((L), 16, 8)]; \ + tmp ^= S2[hc_bfe_S ((L), 8, 8)]; \ + tmp += S3[hc_bfe_S ((L), 0, 8)]; \ \ - (R) ^= tmp ^ P[(N)]; \ + (R) ^= tmp ^ P[(N)]; \ } -#endif - -#ifdef IS_GENERIC -#define BF_ROUND(L,R,N) \ -{ \ - uchar4 c = as_uchar4 ((L)); \ - \ - u32 tmp; \ - \ - tmp = S0[c.s3]; \ - tmp += S1[c.s2]; \ - tmp ^= S2[c.s1]; \ - tmp += S3[c.s0]; \ - \ - (R) ^= tmp ^ P[(N)]; \ -} -#endif - -// temporary hack for Apple Iris GPUs (with as little performance drop as possible) - -#if defined (IS_APPLE) && defined (IS_GPU) -#define TMP_TYPE u32 -#else -#define TMP_TYPE u32 -#endif #define BF_ENCRYPT(L,R) \ { \ @@ -361,7 +319,7 @@ __constant u32a c_sbox3[256] = BF_ROUND (L, R, 15); \ BF_ROUND (R, L, 16); \ \ - TMP_TYPE tmp; \ + u32 tmp; \ \ tmp = R; \ R = L; \ @@ -370,24 +328,23 @@ __constant u32a c_sbox3[256] = L ^= P[17]; \ } -DECLSPEC void expand_key (u32 *E, const u32 *W, const u32 len) +DECLSPEC void expand_key (u32 *E, u32 *W, const int len) { - u8 *E_cur = (u8 *) E; - u8 *E_stop = E_cur + 72; + u8 *E_ptr = (u8 *) E; + u8 *W_ptr = (u8 *) W; - while (E_cur < E_stop) + for (int pos = 0; pos < 72; pos++) // pos++ is not a bug, we actually want that zero byte here { - u8 *W_cur = (u8 *) W; - u8 *W_stop = W_cur + len; + const int left = 72 - pos; - while (W_cur < W_stop) + const int sz = (len < left) ? len : left; // should be MIN() + + for (int i = 0; i < sz; i++) { - *E_cur++ = *W_cur++; - - if (E_cur == E_stop) return; + E_ptr[pos + i] = W_ptr[i]; } - *E_cur++ = 0; + pos += sz; } } @@ -425,7 +382,7 @@ __kernel void __attribute__((reqd_work_group_size(8, 1, 1))) m03200_init (__glob w[16] = pws[gid].i[16]; w[17] = pws[gid].i[17]; - u32 E[18]; + u32 E[18] = { 0 }; expand_key (E, w, pw_len);