diff --git a/OpenCL/m01800.cl b/OpenCL/m01800.cl
index 8cff71db4..ae25f1d51 100644
--- a/OpenCL/m01800.cl
+++ b/OpenCL/m01800.cl
@@ -20,6 +20,14 @@
 #define COMPARE_S "OpenCL/check_single_comp4.c"
 #define COMPARE_M "OpenCL/check_multi_comp4.c"
 
+// Buggy drivers...
+
+#ifdef IS_AMD
+#define STATE_DECL volatile
+#else
+#define STATE_DECL
+#endif
+
 #define PUTCHAR64_BE(a,p,c) ((u8 *)(a))[(p) ^ 7] = (u8) (c)
 #define GETCHAR64_BE(a,p) ((u8 *)(a))[(p) ^ 7]
 
@@ -74,14 +82,14 @@ static void sha512_transform (const u64 w[16], u64 digest[8])
   u64 we_t = w[14];
   u64 wf_t = w[15];
 
-  u64 a = digest[0];
-  u64 b = digest[1];
-  u64 c = digest[2];
-  u64 d = digest[3];
-  u64 e = digest[4];
-  u64 f = digest[5];
-  u64 g = digest[6];
-  u64 h = digest[7];
+  STATE_DECL u64 a = digest[0];
+  STATE_DECL u64 b = digest[1];
+  STATE_DECL u64 c = digest[2];
+  STATE_DECL u64 d = digest[3];
+  STATE_DECL u64 e = digest[4];
+  STATE_DECL u64 f = digest[5];
+  STATE_DECL u64 g = digest[6];
+  STATE_DECL u64 h = digest[7];
 
   #define ROUND_EXPAND() \
   { \
@@ -141,73 +149,6 @@ static void sha512_transform (const u64 w[16], u64 digest[8])
   digest[7] += h;
 }
 
-#ifdef IS_AMD
-static void sha512_transform_workaround (const u64 w[16], u64 digest[8])
-{
-  u64 w0_t = w[ 0];
-  u64 w1_t = w[ 1];
-  u64 w2_t = w[ 2];
-  u64 w3_t = w[ 3];
-  u64 w4_t = w[ 4];
-  u64 w5_t = w[ 5];
-  u64 w6_t = w[ 6];
-  u64 w7_t = w[ 7];
-  u64 w8_t = w[ 8];
-  u64 w9_t = w[ 9];
-  u64 wa_t = w[10];
-  u64 wb_t = w[11];
-  u64 wc_t = w[12];
-  u64 wd_t = w[13];
-  u64 we_t = w[14];
-  u64 wf_t = w[15];
-
-  u64 a = digest[0];
-  u64 b = digest[1];
-  u64 c = digest[2];
-  u64 d = digest[3];
-  u64 e = digest[4];
-  u64 f = digest[5];
-  u64 g = digest[6];
-  u64 h = digest[7];
-
-  #define ROUND_EXPAND_WO() \
-  { \
-    w0_t = SHA512_EXPAND_WO (we_t, w9_t, w1_t, w0_t); \
-    w1_t = SHA512_EXPAND_WO (wf_t, wa_t, w2_t, w1_t); \
-    w2_t = SHA512_EXPAND_WO (w0_t, wb_t, w3_t, w2_t); \
-    w3_t = SHA512_EXPAND_WO (w1_t, wc_t, w4_t, w3_t); \
-    w4_t = SHA512_EXPAND_WO (w2_t, wd_t, w5_t, w4_t); \
-    w5_t = SHA512_EXPAND_WO (w3_t, we_t, w6_t, w5_t); \
-    w6_t = SHA512_EXPAND_WO (w4_t, wf_t, w7_t, w6_t); \
-    w7_t = SHA512_EXPAND_WO (w5_t, w0_t, w8_t, w7_t); \
-    w8_t = SHA512_EXPAND_WO (w6_t, w1_t, w9_t, w8_t); \
-    w9_t = SHA512_EXPAND_WO (w7_t, w2_t, wa_t, w9_t); \
-    wa_t = SHA512_EXPAND_WO (w8_t, w3_t, wb_t, wa_t); \
-    wb_t = SHA512_EXPAND_WO (w9_t, w4_t, wc_t, wb_t); \
-    wc_t = SHA512_EXPAND_WO (wa_t, w5_t, wd_t, wc_t); \
-    wd_t = SHA512_EXPAND_WO (wb_t, w6_t, we_t, wd_t); \
-    we_t = SHA512_EXPAND_WO (wc_t, w7_t, wf_t, we_t); \
-    wf_t = SHA512_EXPAND_WO (wd_t, w8_t, w0_t, wf_t); \
-  }
-
-  ROUND_STEP (0);
-
-  for (int i = 16; i < 80; i += 16)
-  {
-    ROUND_EXPAND_WO (); ROUND_STEP (i);
-  }
-
-  digest[0] += a;
-  digest[1] += b;
-  digest[2] += c;
-  digest[3] += d;
-  digest[4] += e;
-  digest[5] += f;
-  digest[6] += g;
-  digest[7] += h;
-}
-#endif
-
 static void sha512_init (sha512_ctx_t *sha512_ctx)
 {
   sha512_ctx->state[0] = SHA512M_A;
@@ -245,17 +186,7 @@ static void sha512_update (sha512_ctx_t *sha512_ctx, const u64 *buf, int len)
     PUTCHAR64_BE (sha512_ctx->buf, pos++, GETCHAR64_BE (buf, i));
   }
 
-  #ifdef IS_AMD
-  sha512_transform_workaround (sha512_ctx->buf, sha512_ctx->state);
-  #endif
-
-  #ifdef IS_NV
   sha512_transform (sha512_ctx->buf, sha512_ctx->state);
-  #endif
-
-  #ifdef IS_GENERIC
-  sha512_transform (sha512_ctx->buf, sha512_ctx->state);
-  #endif
 
   len -= cnt;
 
diff --git a/OpenCL/types_ocl.c b/OpenCL/types_ocl.c
index 77264b84b..f8991257a 100644
--- a/OpenCL/types_ocl.c
+++ b/OpenCL/types_ocl.c
@@ -177,24 +177,19 @@ static inline u32 rotl32_S (const u32 a, const u32 n)
 
 static inline u64 rotr64_S (const u64 a, const u32 n)
 {
-  u64 r;
-
   #if DEVICE_TYPE == DEVICE_TYPE_CPU
 
-  r = rotate (a, (u64) 64 - n);
+  const u64 r = rotate (a, (u64) 64 - n);
 
   #else
 
-  uint2 a2 = as_uint2 (a);
+  const u32 a0 = h32_from_64_S (a);
+  const u32 a1 = l32_from_64_S (a);
 
-  uint2 t;
+  const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
+  const u32 t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);
 
-  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32)
-                   : amd_bitalign (a2.s1, a2.s0, n);
-  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32)
-                   : amd_bitalign (a2.s0, a2.s1, n);
-
-  r = as_ulong (t);
+  const u64 r = hl32_to_64_S (t0, t1);
 
   #endif
 
@@ -238,159 +233,20 @@ static inline u32x rotl32 (const u32x a, const u32 n)
 
 static inline u64x rotr64 (const u64x a, const u32 n)
 {
-  u64x r;
-
   #if DEVICE_TYPE == DEVICE_TYPE_CPU
 
-  r = rotate (a, (u64) 64 - n);
+  const u64x r = rotate (a, (u64) 64 - n);
 
   #else
 
-  uint2 a2;
-  uint2 t;
+  const u32x a0 = h32_from_64 (a);
+  const u32x a1 = l32_from_64 (a);
 
-  #if VECT_SIZE == 1
+  const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n);
+  const u32x t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n);
 
-  a2 = as_uint2 (a);
+  const u64x r = hl32_to_64 (t0, t1);
 
-  t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-  t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-  r = as_ulong (t);
-
-  #elif VECT_SIZE == 2
-
-  {
-    a2 = as_uint2 (a.s0);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s0 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s1);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s1 = as_ulong (t);
-  }
-
-  #elif VECT_SIZE == 4
-
-  {
-    a2 = as_uint2 (a.s0);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s0 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s1);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s1 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s2);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s2 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s3);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s3 = as_ulong (t);
-  }
-
-  #elif VECT_SIZE == 8
-
-  {
-    a2 = as_uint2 (a.s0);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s0 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s1);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s1 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s2);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s2 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s3);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s3 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s4);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s4 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s5);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s5 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s6);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s6 = as_ulong (t);
-  }
-
-  {
-    a2 = as_uint2 (a.s7);
-
-    t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n);
-    t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n);
-
-    r.s7 = as_ulong (t);
-  }
-
-  #endif
   #endif
 
   return r;
diff --git a/include/kernel_functions.c b/include/kernel_functions.c
index e34480875..c7d1e0f22 100644
--- a/include/kernel_functions.c
+++ b/include/kernel_functions.c
@@ -265,16 +265,13 @@
 
 #define SHA384_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
 { \
-  u64x temp0; \
-  temp0 = K; \
-  temp0 += x; \
-  temp0 += h; \
-  temp0 += SHA384_S1 (e); \
-  temp0 += F0 (e, f, g); \
-  d += temp0; \
-  h = SHA384_S0 (a); \
-  h += F1 (a, b, c); \
-  h += temp0; \
+  h += K; \
+  h += x; \
+  h += SHA384_S1 (e); \
+  h += F0 (e, f, g); \
+  d += h; \
+  h += SHA384_S0 (a); \
+  h += F1 (a, b, c); \
 }
 
 #define SHA384_EXPAND(x,y,z,w) (SHA384_S3 (x) + y + SHA384_S2 (z) + w)
@@ -309,24 +306,16 @@
 
 #define SHA512_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
 { \
-  u64x temp0; \
-  temp0 = K; \
-  temp0 += x; \
-  temp0 += h; \
-  temp0 += SHA512_S1 (e); \
-  temp0 += F0 (e, f, g); \
-  d += temp0; \
-  h = SHA512_S0 (a); \
-  h += F1 (a, b, c); \
-  h += temp0; \
+  h += K; \
+  h += x; \
+  h += SHA512_S1 (e); \
+  h += F0 (e, f, g); \
+  d += h; \
+  h += SHA512_S0 (a); \
+  h += F1 (a, b, c); \
 }
 
 #define SHA512_EXPAND(x,y,z,w) (SHA512_S3 (x) + y + SHA512_S2 (z) + w)
-
-#define SHA512_S2_WO(x) (rotate ((x), 64- 1ull) ^ rotate ((x), 64- 8ull) ^ SHIFT_RIGHT_64 ((x), 7))
-#define SHA512_S3_WO(x) (rotate ((x), 64-19ull) ^ rotate ((x), 64-61ull) ^ SHIFT_RIGHT_64 ((x), 6))
-
-#define SHA512_EXPAND_WO(x,y,z,w) (SHA512_S3_WO (x) + y + SHA512_S2_WO (z) + w)
 #endif
 
 #ifdef _RIPEMD160_
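
Note on the rotr64_S / rotr64 rewrite above (reviewer sketch, not part of the patch): a 64-bit rotate right can be built from two 32-bit funnel shifts over the high and low halves, which is what the amd_bitalign pair computes; for n >= 32 the halves are effectively swapped first by shifting against n - 32. The standalone C program below checks that construction against a plain 64-bit rotate. It assumes amd_bitalign (hi, lo, s) returns the low 32 bits of (((u64) hi << 32) | lo) >> (s & 31), and that h32_from_64* / l32_from_64* / hl32_to_64* are plain high/low split and join helpers; the emulation below is mine, not hashcat code.

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

typedef uint32_t u32;
typedef uint64_t u64;

/* host-side stand-in for amd_bitalign: low 32 bits of ((hi:lo) >> (s & 31)) */
static u32 bitalign (const u32 hi, const u32 lo, const u32 s)
{
  return (u32) ((((u64) hi << 32) | lo) >> (s & 31));
}

/* reference 64-bit rotate right */
static u64 rotr64_ref (const u64 a, const u32 n)
{
  const u32 m = n & 63;

  return (m == 0) ? a : (a >> m) | (a << (64 - m));
}

/* mirror of the patched rotr64_S: split, two funnel shifts, rejoin */
static u64 rotr64_split (const u64 a, const u32 n)
{
  const u32 a0 = (u32) (a >> 32); /* stand-in for h32_from_64_S */
  const u32 a1 = (u32) (a);       /* stand-in for l32_from_64_S */

  const u32 t0 = (n >= 32) ? bitalign (a0, a1, n - 32) : bitalign (a1, a0, n);
  const u32 t1 = (n >= 32) ? bitalign (a1, a0, n - 32) : bitalign (a0, a1, n);

  return ((u64) t0 << 32) | t1;   /* stand-in for hl32_to_64_S */
}

int main (void)
{
  const u64 samples[] = { 0x0123456789abcdefULL, 0xffffffff00000000ULL, 0x8000000000000001ULL, 1, 0 };

  const int num = (int) (sizeof (samples) / sizeof (samples[0]));

  for (u32 n = 0; n < 64; n++)
  {
    for (int i = 0; i < num; i++)
    {
      assert (rotr64_split (samples[i], n) == rotr64_ref (samples[i], n));
    }
  }

  printf ("split rotr64 matches the reference rotate for all n in [0,63]\n");

  return 0;
}

Because h32_from_64 / l32_from_64 / hl32_to_64 presumably operate per vector component, one expression covers every VECT_SIZE, which is what lets the per-component unrolling in rotr64 be dropped. The SHA384_STEP / SHA512_STEP change is the same arithmetic with temp0 folded into h: h accumulates K + x + S1(e) + F0(e, f, g), d adds that sum, and h then adds S0(a) + F1(a, b, c), so no separate temporary is needed.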