diff --git a/OpenCL/inc_hash_sha224.cl b/OpenCL/inc_hash_sha224.cl index e93206e44..e3f5b4d1c 100644 --- a/OpenCL/inc_hash_sha224.cl +++ b/OpenCL/inc_hash_sha224.cl @@ -104,6 +104,11 @@ DECLSPEC void sha224_transform (const u32 *w0, const u32 *w1, const u32 *w2, con ROUND_STEP_S (0); + #ifdef IS_CUDA + ROUND_EXPAND_S (); ROUND_STEP_S (16); + ROUND_EXPAND_S (); ROUND_STEP_S (32); + ROUND_EXPAND_S (); ROUND_STEP_S (48); + #else #ifdef _unroll #pragma unroll #endif @@ -111,6 +116,7 @@ DECLSPEC void sha224_transform (const u32 *w0, const u32 *w1, const u32 *w2, con { ROUND_EXPAND_S (); ROUND_STEP_S (i); } + #endif #undef ROUND_EXPAND_S #undef ROUND_STEP_S diff --git a/OpenCL/inc_hash_sha256.cl b/OpenCL/inc_hash_sha256.cl index de2bd5897..d6d0a69b0 100644 --- a/OpenCL/inc_hash_sha256.cl +++ b/OpenCL/inc_hash_sha256.cl @@ -104,6 +104,11 @@ DECLSPEC void sha256_transform (const u32 *w0, const u32 *w1, const u32 *w2, con ROUND_STEP_S (0); + #ifdef IS_CUDA + ROUND_EXPAND_S (); ROUND_STEP_S (16); + ROUND_EXPAND_S (); ROUND_STEP_S (32); + ROUND_EXPAND_S (); ROUND_STEP_S (48); + #else #ifdef _unroll #pragma unroll #endif @@ -111,6 +116,7 @@ DECLSPEC void sha256_transform (const u32 *w0, const u32 *w1, const u32 *w2, con { ROUND_EXPAND_S (); ROUND_STEP_S (i); } + #endif #undef ROUND_EXPAND_S #undef ROUND_STEP_S diff --git a/OpenCL/inc_hash_sha384.cl b/OpenCL/inc_hash_sha384.cl index ea26ec734..61e2f5d3a 100644 --- a/OpenCL/inc_hash_sha384.cl +++ b/OpenCL/inc_hash_sha384.cl @@ -108,6 +108,12 @@ DECLSPEC void sha384_transform (const u32 *w0, const u32 *w1, const u32 *w2, con ROUND_STEP_S (0); + #ifdef IS_CUDA + ROUND_EXPAND_S (); ROUND_STEP_S (16); + ROUND_EXPAND_S (); ROUND_STEP_S (32); + ROUND_EXPAND_S (); ROUND_STEP_S (48); + ROUND_EXPAND_S (); ROUND_STEP_S (64); + #else #ifdef _unroll #pragma unroll #endif @@ -115,6 +121,7 @@ DECLSPEC void sha384_transform (const u32 *w0, const u32 *w1, const u32 *w2, con { ROUND_EXPAND_S (); ROUND_STEP_S (i); } + #endif #undef ROUND_EXPAND_S #undef ROUND_STEP_S diff --git a/OpenCL/inc_hash_sha512.cl b/OpenCL/inc_hash_sha512.cl index 783a66fbe..6dc91f368 100644 --- a/OpenCL/inc_hash_sha512.cl +++ b/OpenCL/inc_hash_sha512.cl @@ -108,6 +108,12 @@ DECLSPEC void sha512_transform (const u32 *w0, const u32 *w1, const u32 *w2, con ROUND_STEP_S (0); + #ifdef IS_CUDA + ROUND_EXPAND_S (); ROUND_STEP_S (16); + ROUND_EXPAND_S (); ROUND_STEP_S (32); + ROUND_EXPAND_S (); ROUND_STEP_S (48); + ROUND_EXPAND_S (); ROUND_STEP_S (64); + #else #ifdef _unroll #pragma unroll #endif @@ -115,6 +121,7 @@ DECLSPEC void sha512_transform (const u32 *w0, const u32 *w1, const u32 *w2, con { ROUND_EXPAND_S (); ROUND_STEP_S (i); } + #endif #undef ROUND_EXPAND_S #undef ROUND_STEP_S diff --git a/OpenCL/m01700_a0-optimized.cl b/OpenCL/m01700_a0-optimized.cl index bd71b1678..0d6ddb337 100644 --- a/OpenCL/m01700_a0-optimized.cl +++ b/OpenCL/m01700_a0-optimized.cl @@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01700_a1-optimized.cl b/OpenCL/m01700_a1-optimized.cl index 2716dbdec..abee4dfcb 100644 --- a/OpenCL/m01700_a1-optimized.cl +++ b/OpenCL/m01700_a1-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01700_a3-optimized.cl b/OpenCL/m01700_a3-optimized.cl index 5e2820592..c4d8ee016 100644 --- a/OpenCL/m01700_a3-optimized.cl +++ b/OpenCL/m01700_a3-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01710_a0-optimized.cl b/OpenCL/m01710_a0-optimized.cl index 6f3130ac9..a5a53e831 100644 --- a/OpenCL/m01710_a0-optimized.cl +++ b/OpenCL/m01710_a0-optimized.cl @@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01710_a1-optimized.cl b/OpenCL/m01710_a1-optimized.cl index 8d8a87380..931142cae 100644 --- a/OpenCL/m01710_a1-optimized.cl +++ b/OpenCL/m01710_a1-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01710_a3-optimized.cl b/OpenCL/m01710_a3-optimized.cl index 353d1395b..a82f949ac 100644 --- a/OpenCL/m01710_a3-optimized.cl +++ b/OpenCL/m01710_a3-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01720_a0-optimized.cl b/OpenCL/m01720_a0-optimized.cl index 14503946d..c331365f1 100644 --- a/OpenCL/m01720_a0-optimized.cl +++ b/OpenCL/m01720_a0-optimized.cl @@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01720_a1-optimized.cl b/OpenCL/m01720_a1-optimized.cl index fbacd956b..aa93dc2c9 100644 --- a/OpenCL/m01720_a1-optimized.cl +++ b/OpenCL/m01720_a1-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01720_a3-optimized.cl b/OpenCL/m01720_a3-optimized.cl index 168cb7f96..891634dd4 100644 --- a/OpenCL/m01720_a3-optimized.cl +++ b/OpenCL/m01720_a3-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01730_a0-optimized.cl b/OpenCL/m01730_a0-optimized.cl index 2d1d4d6bc..f5da15e7f 100644 --- a/OpenCL/m01730_a0-optimized.cl +++ b/OpenCL/m01730_a0-optimized.cl @@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01730_a1-optimized.cl b/OpenCL/m01730_a1-optimized.cl index 7515d2cc0..f3cd8d89a 100644 --- a/OpenCL/m01730_a1-optimized.cl +++ b/OpenCL/m01730_a1-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01730_a3-optimized.cl b/OpenCL/m01730_a3-optimized.cl index d2f01afd8..e00e5f4ae 100644 --- a/OpenCL/m01730_a3-optimized.cl +++ b/OpenCL/m01730_a3-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01740_a0-optimized.cl b/OpenCL/m01740_a0-optimized.cl index 717aba0c0..ee38662e8 100644 --- a/OpenCL/m01740_a0-optimized.cl +++ b/OpenCL/m01740_a0-optimized.cl @@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01740_a1-optimized.cl b/OpenCL/m01740_a1-optimized.cl index aa765e337..0ae6984e0 100644 --- a/OpenCL/m01740_a1-optimized.cl +++ b/OpenCL/m01740_a1-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m01740_a3-optimized.cl b/OpenCL/m01740_a3-optimized.cl index 8b6322f85..4b7b1d3df 100644 --- a/OpenCL/m01740_a3-optimized.cl +++ b/OpenCL/m01740_a3-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m08000_a0-optimized.cl b/OpenCL/m08000_a0-optimized.cl index f259ea250..310bebbeb 100644 --- a/OpenCL/m08000_a0-optimized.cl +++ b/OpenCL/m08000_a0-optimized.cl @@ -86,6 +86,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +98,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) { ROUND_EXPAND (); ROUND_STEP (i); } + #endif digest[0] += a; digest[1] += b; @@ -137,6 +143,11 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); + #ifdef IS_CUDA + ROUND_STEP_Z (16); + ROUND_STEP_Z (32); + ROUND_STEP_Z (48); + #else #ifdef _unroll #pragma unroll #endif @@ -144,6 +155,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) { ROUND_STEP_Z (i); } + #endif digest[0] += a; digest[1] += b; diff --git a/OpenCL/m08000_a1-optimized.cl b/OpenCL/m08000_a1-optimized.cl index 599364f44..89ea42a57 100644 --- a/OpenCL/m08000_a1-optimized.cl +++ b/OpenCL/m08000_a1-optimized.cl @@ -84,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) { ROUND_EXPAND (); ROUND_STEP (i); } + #endif digest[0] += a; digest[1] += b; @@ -135,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); + #ifdef IS_CUDA + ROUND_STEP_Z (16); + ROUND_STEP_Z (32); + ROUND_STEP_Z (48); + #else #ifdef _unroll #pragma unroll #endif @@ -142,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) { ROUND_STEP_Z (i); } + #endif digest[0] += a; digest[1] += b; diff --git a/OpenCL/m08000_a3-optimized.cl b/OpenCL/m08000_a3-optimized.cl index 3068a6621..fa76a3b72 100644 --- a/OpenCL/m08000_a3-optimized.cl +++ b/OpenCL/m08000_a3-optimized.cl @@ -84,6 +84,11 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +96,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) { ROUND_EXPAND (); ROUND_STEP (i); } + #endif digest[0] += a; digest[1] += b; @@ -135,6 +141,11 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); + #ifdef IS_CUDA + ROUND_STEP_Z (16); + ROUND_STEP_Z (32); + ROUND_STEP_Z (48); + #else #ifdef _unroll #pragma unroll #endif @@ -142,6 +153,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) { ROUND_STEP_Z (i); } + #endif digest[0] += a; digest[1] += b; diff --git a/OpenCL/m10800_a0-optimized.cl b/OpenCL/m10800_a0-optimized.cl index e9cfd2167..396b389a6 100644 --- a/OpenCL/m10800_a0-optimized.cl +++ b/OpenCL/m10800_a0-optimized.cl @@ -86,6 +86,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +99,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m10800_a1-optimized.cl b/OpenCL/m10800_a1-optimized.cl index f7828aa9f..11aa95dbd 100644 --- a/OpenCL/m10800_a1-optimized.cl +++ b/OpenCL/m10800_a1-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m10800_a3-optimized.cl b/OpenCL/m10800_a3-optimized.cl index f19e1b224..cef22d51f 100644 --- a/OpenCL/m10800_a3-optimized.cl +++ b/OpenCL/m10800_a3-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m15000_a0-optimized.cl b/OpenCL/m15000_a0-optimized.cl index a4fe67e48..7b73564a3 100644 --- a/OpenCL/m15000_a0-optimized.cl +++ b/OpenCL/m15000_a0-optimized.cl @@ -86,6 +86,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -93,6 +99,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m15000_a1-optimized.cl b/OpenCL/m15000_a1-optimized.cl index e410b3102..1bcd7a983 100644 --- a/OpenCL/m15000_a1-optimized.cl +++ b/OpenCL/m15000_a1-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/OpenCL/m15000_a3-optimized.cl b/OpenCL/m15000_a3-optimized.cl index 5ff6d7bfb..cf1ff1432 100644 --- a/OpenCL/m15000_a3-optimized.cl +++ b/OpenCL/m15000_a3-optimized.cl @@ -84,6 +84,12 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); + #ifdef IS_CUDA + ROUND_EXPAND (); ROUND_STEP (16); + ROUND_EXPAND (); ROUND_STEP (32); + ROUND_EXPAND (); ROUND_STEP (48); + ROUND_EXPAND (); ROUND_STEP (64); + #else #ifdef _unroll #pragma unroll #endif @@ -91,6 +97,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 { ROUND_EXPAND (); ROUND_STEP (i); } + #endif /* rev digest[0] += a; diff --git a/src/autotune.c b/src/autotune.c index 43b5b46bb..11dc8c1d1 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -47,6 +47,7 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par return exec_msec_prev; } +/* static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops) { hashconfig_t *hashconfig = hashcat_ctx->hashconfig; @@ -93,6 +94,7 @@ static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t * return exec_msec_prev; } +*/ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { @@ -261,6 +263,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param const u32 kernel_accel_orig = kernel_accel; const u32 kernel_loops_orig = kernel_loops; + double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + for (int i = 1; i < STEPS_CNT; i++) { const u32 kernel_accel_try = kernel_accel_orig * (1u << i); @@ -272,6 +276,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (kernel_loops_try > kernel_loops_max) continue; if (kernel_loops_try < kernel_loops_min) break; + // do a real test + + const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try); + + if (exec_msec_prev < exec_msec) break; + + exec_msec_prev = exec_msec; + + // so far, so good! save + kernel_accel = kernel_accel_try; kernel_loops = kernel_loops_try; @@ -299,6 +313,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // start finding best thread count is easier. // it's either the preferred or the maximum thread count + /* const u32 kernel_threads_min = device_param->kernel_threads_min; const u32 kernel_threads_max = device_param->kernel_threads_max; @@ -334,6 +349,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } } } + */ if (device_param->is_cuda == true) {