From b5cb29ad1c6eb131dbed091f0ee6f0db848e817f Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 16 May 2016 21:30:21 +0200 Subject: [PATCH] Minimal psafe2 increase and autotune fix --- OpenCL/m09000.cl | 22 ++++++++++++---------- OpenCL/types_ocl.c | 7 ++++++- src/hashcat.c | 27 +++++++++++++++------------ 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/OpenCL/m09000.cl b/OpenCL/m09000.cl index 9e12988b6..210b62068 100644 --- a/OpenCL/m09000.cl +++ b/OpenCL/m09000.cl @@ -320,16 +320,16 @@ __constant u32 c_pbox[18] = #endif #ifdef IS_NV -#define BF_ROUND(L,R,N) \ -{ \ - u32 tmp; \ - \ - tmp = S0[__bfe ((L), 24, 8)]; \ - tmp += S1[__bfe ((L), 16, 8)]; \ - tmp ^= S2[__bfe ((L), 8, 8)]; \ - tmp += S3[__bfe ((L), 0, 8)]; \ - \ - (R) ^= tmp ^ P[(N)]; \ +#define BF_ROUND(L,R,N) \ +{ \ + u32 tmp; \ + \ + tmp = S0[__bfe_S ((L), 24, 8)]; \ + tmp += S1[__bfe_S ((L), 16, 8)]; \ + tmp ^= S2[__bfe_S ((L), 8, 8)]; \ + tmp += S3[__bfe_S ((L), 0, 8)]; \ + \ + (R) ^= tmp ^ P[(N)]; \ } #endif @@ -747,6 +747,7 @@ __kernel void __attribute__((reqd_work_group_size (8, 1, 1))) m09000_loop (__glo u32 P[18]; + #pragma unroll for (u32 i = 0; i < 18; i++) { P[i] = tmps[gid].P[i]; @@ -762,6 +763,7 @@ __kernel void __attribute__((reqd_work_group_size (8, 1, 1))) m09000_loop (__glo __local u32 *S2 = S2_all[lid]; __local u32 *S3 = S3_all[lid]; + #pragma unroll for (u32 i = 0; i < 256; i++) { S0[i] = tmps[gid].S0[i]; diff --git a/OpenCL/types_ocl.c b/OpenCL/types_ocl.c index 90e034362..be492db82 100644 --- a/OpenCL/types_ocl.c +++ b/OpenCL/types_ocl.c @@ -257,7 +257,12 @@ inline u64x rotl64 (const u64x a, const u32 n) return rotr64 (a, 64 - n); } -inline u32 __bfe (const u32 a, const u32 b, const u32 c) +inline u32x __bfe (const u32x a, const u32x b, const u32x c) +{ + return amd_bfe (a, b, c); +} + +inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) { return amd_bfe (a, b, c); } diff --git a/src/hashcat.c b/src/hashcat.c index 1e7393e99..def3c645c 100644 --- a/src/hashcat.c +++ b/src/hashcat.c @@ -2900,22 +2900,25 @@ static void autotune (hc_device_param_t *device_param) // balancing the workload turns out to be very efficient - const u32 kernel_power_balance = kernel_accel * kernel_loops; + if (kernel_loops_min != kernel_loops_max) + { + const u32 kernel_power_balance = kernel_accel * kernel_loops; - u32 sqrtv; + u32 sqrtv; - for (sqrtv = 1; sqrtv < 0x100000; sqrtv++) - { - if ((sqrtv * sqrtv) >= kernel_power_balance) break; - } + for (sqrtv = 1; sqrtv < 0x100000; sqrtv++) + { + if ((sqrtv * sqrtv) >= kernel_power_balance) break; + } - const u32 kernel_accel_try = sqrtv; - const u32 kernel_loops_try = sqrtv; + const u32 kernel_accel_try = sqrtv; + const u32 kernel_loops_try = sqrtv; - if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min)) - { - kernel_accel = kernel_accel_try; - kernel_loops = kernel_loops_try; + if ((kernel_accel_try <= kernel_accel_max) && (kernel_loops_try >= kernel_loops_min)) + { + kernel_accel = kernel_accel_try; + kernel_loops = kernel_loops_try; + } } // reset fake words