From 72e3821a4cffa30564f65b266b6044bc6f854173 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Thu, 5 May 2016 23:21:15 +0200 Subject: [PATCH] Simplify auto-tuning and benchmark routines Decrease the time it takes to run a benchmark Removed --benchmark-repeat, it creates no advantage Fix some wording related to drivers Dropped special 64-bit rotate() handling for NV Cleanup SHA384 Cleanup try_run() --- OpenCL/m10800_a0.cl | 9 +- OpenCL/m10800_a1.cl | 9 +- OpenCL/m10800_a3.cl | 9 +- OpenCL/types_ocl.c | 353 +------------------------------------------- docs/changes.txt | 1 + docs/readme.txt | 4 +- src/oclHashcat.c | 291 ++++++++++-------------------------- 7 files changed, 99 insertions(+), 577 deletions(-) diff --git a/OpenCL/m10800_a0.cl b/OpenCL/m10800_a0.cl index f28d7983a..78ad9984c 100644 --- a/OpenCL/m10800_a0.cl +++ b/OpenCL/m10800_a0.cl @@ -116,10 +116,11 @@ void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], con ROUND_STEP (0); - ROUND_EXPAND (); ROUND_STEP (16); - ROUND_EXPAND (); ROUND_STEP (32); - ROUND_EXPAND (); ROUND_STEP (48); - ROUND_EXPAND (); ROUND_STEP (64); + #pragma unroll + for (int i = 16; i < 80; i += 16) + { + ROUND_EXPAND (); ROUND_STEP (i); + } /* rev digest[0] += a; diff --git a/OpenCL/m10800_a1.cl b/OpenCL/m10800_a1.cl index 0e0e864c0..24762c04b 100644 --- a/OpenCL/m10800_a1.cl +++ b/OpenCL/m10800_a1.cl @@ -114,10 +114,11 @@ void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], con ROUND_STEP (0); - ROUND_EXPAND (); ROUND_STEP (16); - ROUND_EXPAND (); ROUND_STEP (32); - ROUND_EXPAND (); ROUND_STEP (48); - ROUND_EXPAND (); ROUND_STEP (64); + #pragma unroll + for (int i = 16; i < 80; i += 16) + { + ROUND_EXPAND (); ROUND_STEP (i); + } /* rev digest[0] += a; diff --git a/OpenCL/m10800_a3.cl b/OpenCL/m10800_a3.cl index 3ffe5a05b..c6df6715d 100644 --- a/OpenCL/m10800_a3.cl +++ b/OpenCL/m10800_a3.cl @@ -114,10 +114,11 @@ void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], con ROUND_STEP (0); - ROUND_EXPAND (); ROUND_STEP (16); - ROUND_EXPAND (); ROUND_STEP (32); - ROUND_EXPAND (); ROUND_STEP (48); - ROUND_EXPAND (); ROUND_STEP (64); + #pragma unroll + for (int i = 16; i < 80; i += 16) + { + ROUND_EXPAND (); ROUND_STEP (i); + } /* rev digest[0] += a; diff --git a/OpenCL/types_ocl.c b/OpenCL/types_ocl.c index ff464f379..c41c526d4 100644 --- a/OpenCL/types_ocl.c +++ b/OpenCL/types_ocl.c @@ -311,40 +311,10 @@ inline u32 rotl32_S (const u32 a, const u32 n) return rotate (a, n); } -#if CUDA_ARCH >= 350 -inline u64 rotr64_S (const u64 a, const u32 n) -{ - u32 il; - u32 ir; - - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a)); - - u32 tl; - u32 tr; - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - u64 r; - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr)); - - return r; -} -#else inline u64 rotr64_S (const u64 a, const u32 n) { return rotate (a, (u64) 64 - n); } -#endif inline u64 rotl64_S (const u64 a, const u32 n) { @@ -390,335 +360,14 @@ inline u32x rotl32 (const u32x a, const u32 n) return rotate (a, n); } -#if CUDA_ARCH >= 350 -inline u64x rotr64 (const u64x a, const u32 n) -{ - u64x r; - - u32 il; - u32 ir; - u32 tl; - u32 tr; - - #if VECT_SIZE == 1 - - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr)); - - #endif - - #if VECT_SIZE >= 2 - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s0)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s0) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s1)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s1) : "r"(tl), "r"(tr)); - } - - #endif - - #if VECT_SIZE >= 4 - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s2)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s2) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s3)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s3) : "r"(tl), "r"(tr)); - } - - #endif - - #if VECT_SIZE >= 8 - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s4)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s4) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s5)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s5) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s6)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s6) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s7)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s7) : "r"(tl), "r"(tr)); - } - - #endif - - #if VECT_SIZE >= 16 - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s8)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s8) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s9)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s9) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sa)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sa) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sb)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sb) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sc)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sc) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sd)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sd) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.se)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.se) : "r"(tl), "r"(tr)); - } - - { - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sf)); - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sf) : "r"(tl), "r"(tr)); - } - - #endif - - return r; -} -#else inline u64x rotr64 (const u64x a, const u32 n) { return rotate (a, (u64) 64 - n); } -#endif inline u64x rotl64 (const u64x a, const u32 n) { - return rotr64 (a, (u64) 64 - n); + return rotate (a, (u64) n); } inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) diff --git a/docs/changes.txt b/docs/changes.txt index f6ec3b5e0..e69b45f2e 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -118,3 +118,4 @@ It combines all features of all hashcat projects in one project. - Added -cl-std=CL1.1 to all kernel build options - Created environment variable to inform NVidia OpenCL runtime to not create its own kernel cache - Created environment variable to inform pocl OpenCL runtime to not create its own kernel cache +- Dropped special 64-bit rotate() handling for NV, it seems that they've added it to their OpenCL runtime diff --git a/docs/readme.txt b/docs/readme.txt index fabead10e..2dc7ec531 100644 --- a/docs/readme.txt +++ b/docs/readme.txt @@ -1,9 +1,9 @@ oclHashcat v2.10 ================ -AMD users require Catalyst 14.9 or later (recommended 15.12 or later) +AMD users require AMD drivers 14.9 or later (recommended 15.12 or later) Intel users require Intel OpenCL Runtime 14.2 or later (recommended 15.1 or later) -NVidia users require ForceWare 346.59 or later (recommended 358.09 or later) +NVidia users require NVidia drivers 346.59 or later (recommended 361.x or later) ## ## Features diff --git a/src/oclHashcat.c b/src/oclHashcat.c index 55525f3e3..114cb3d3d 100644 --- a/src/oclHashcat.c +++ b/src/oclHashcat.c @@ -33,7 +33,6 @@ double TARGET_MS_PROFILE[3] = { 8, 16, 96 }; #define MARKOV_DISABLE 0 #define MARKOV_CLASSIC 0 #define BENCHMARK 0 -#define BENCHMARK_REPEATS 100 #define RESTORE 0 #define RESTORE_TIMER 60 #define RESTORE_DISABLE 0 @@ -2612,7 +2611,7 @@ static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const if (rc != 0) { // NOTE: clEnqueueFillBuffer () always fails with -59 - // IOW, it's not supported by Nvidia ForceWare <= 352.21, also pocl segfaults, also on apple + // IOW, it's not supported by Nvidia drivers <= 352.21, also pocl segfaults, also on apple // How's that possible, OpenCL 1.2 support is advertised?? // We need to workaround... @@ -2788,62 +2787,57 @@ static void run_copy (hc_device_param_t *device_param, const uint pws_cnt) } } -static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const int repeat) +static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops) { const u32 kernel_power = device_param->device_processors * device_param->kernel_threads * kernel_accel; - device_param->kernel_params_buf32[26] = kernel_loops; - device_param->kernel_params_buf32[27] = kernel_loops; + device_param->kernel_params_buf32[25] = 0; + device_param->kernel_params_buf32[26] = kernel_loops; // not a bug, both need to be set + device_param->kernel_params_buf32[27] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes // init some fake words - for (u32 i = 0; i < kernel_power; i++) + if (data.hash_mode == 10700) { - device_param->pws_buf[i].i[0] = i; - device_param->pws_buf[i].i[1] = 0x01234567; - device_param->pws_buf[i].pw_len = 4 + (i & 3); - } + // hash mode 10700 hangs on length 0 (unlimited loop) - hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + for (u32 i = 0; i < kernel_power; i++) + { + device_param->pws_buf[i].i[0] = i; + device_param->pws_buf[i].i[1] = i + 0x01234567; + device_param->pws_buf[i].i[2] = i + 0x89abcdef; + device_param->pws_buf[i].i[3] = 0xffffffff; + device_param->pws_buf[i].pw_len = 4 + (i & 3); + } - if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) - { - run_kernel_amp (device_param, kernel_power); - } + hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); - // caching run + if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) + { + run_kernel_amp (device_param, kernel_power); + } + } if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL) { - run_kernel (KERN_RUN_1, device_param, kernel_power, false); + run_kernel (KERN_RUN_1, device_param, kernel_power, true); } else { - run_kernel (KERN_RUN_2, device_param, kernel_power, false); + run_kernel (KERN_RUN_2, device_param, kernel_power, true); } - // now user repeats - - for (int i = 0; i < repeat; i++) - { - if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - { - run_kernel (KERN_RUN_1, device_param, kernel_power, true); - } - else - { - run_kernel (KERN_RUN_2, device_param, kernel_power, true); - } - } - - const double exec_ms_prev = get_avg_exec_time (device_param, repeat); + const double exec_ms_prev = get_avg_exec_time (device_param, 1); // reset fake words - memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t)); + if (data.hash_mode == 10700) + { + memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t)); - hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); - hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + } return exec_ms_prev; } @@ -2861,183 +2855,96 @@ static void autotune (hc_device_param_t *device_param) u32 kernel_accel = kernel_accel_min; u32 kernel_loops = kernel_loops_min; - // steps - #define STEPS_CNT 10 - #define STEPS_ACCEL_CNT (STEPS_CNT + 2) - #define STEPS_LOOPS_CNT (STEPS_CNT + 2) + #define MAX_RETRIES 1 - u32 steps_accel[STEPS_ACCEL_CNT]; - u32 steps_loops[STEPS_LOOPS_CNT]; + double exec_ms_final = 0; - for (int i = 0; i < STEPS_ACCEL_CNT; i++) - { - steps_accel[i] = 1 << i; - } + // first find out highest kernel-loops that stays below target_ms - for (int i = 0; i < STEPS_LOOPS_CNT; i++) + for (kernel_loops = kernel_loops_max; kernel_loops > kernel_loops_min; kernel_loops >>= 1) { - steps_loops[i] = 1 << i; - } - - steps_accel[STEPS_CNT + 0] = kernel_accel_min; - steps_accel[STEPS_CNT + 1] = kernel_accel_max; - - steps_loops[STEPS_CNT + 0] = kernel_loops_min; - steps_loops[STEPS_CNT + 1] = kernel_loops_max; + double exec_ms_best = try_run (device_param, kernel_accel_min, kernel_loops); - qsort (steps_accel, STEPS_ACCEL_CNT, sizeof (u32), sort_by_u32); - qsort (steps_loops, STEPS_LOOPS_CNT, sizeof (u32), sort_by_u32); - - // find out highest kernel-loops that stays below target_ms, we can use it later for multiplication as this is a linear function + for (int i = 0; i < MAX_RETRIES; i++) + { + const double exec_ms_cur = try_run (device_param, kernel_accel_min, kernel_loops); - u32 kernel_loops_tmp; + exec_ms_best = MIN (exec_ms_best, exec_ms_cur); + } - for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp > kernel_loops_min; kernel_loops_tmp >>= 1) - { - const double exec_ms = try_run (device_param, kernel_accel_min, kernel_loops_tmp, 1); + if (exec_ms_final == 0) exec_ms_final = exec_ms_best; - if (exec_ms < target_ms) break; + if (exec_ms_best < target_ms) break; } - // kernel-accel + // now the same for kernel-accel but with the new kernel-loops from previous loop set if (kernel_accel_min < kernel_accel_max) { - double e_best = 0; - - for (int i = 0; i < STEPS_ACCEL_CNT; i++) + for (int i = 0; i < STEPS_CNT; i++) { - const u32 kernel_accel_try = steps_accel[i]; + const u32 kernel_accel_try = 1 << i; if (kernel_accel_try < kernel_accel_min) continue; if (kernel_accel_try > kernel_accel_max) break; - const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1); - - if (exec_ms > target_ms) break; + double exec_ms_best = try_run (device_param, kernel_accel_try, kernel_loops); - const double e = kernel_accel_try / exec_ms; - - if (e > e_best) + for (int i = 0; i < MAX_RETRIES; i++) { - kernel_accel = kernel_accel_try; + const double exec_ms_cur = try_run (device_param, kernel_accel_try, kernel_loops); - e_best = e; + exec_ms_best = MIN (exec_ms_best, exec_ms_cur); } - } - } - // kernel-loops final + if (exec_ms_best > target_ms) break; - if (kernel_loops_min < kernel_loops_max) - { - double e_best = 0; - - for (int i = 0; i < STEPS_LOOPS_CNT; i++) - { - const u32 kernel_loops_try = steps_loops[i]; - - if (kernel_loops_try < kernel_loops_min) continue; - if (kernel_loops_try > kernel_loops_max) break; - - const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1); - - if (exec_ms > target_ms) break; + exec_ms_final = exec_ms_best; - const double e = kernel_loops_try / exec_ms; - - if (e > e_best) - { - kernel_loops = kernel_loops_try; - - e_best = e; - } + kernel_accel = kernel_accel_try; } } - // final balance - - u32 kernel_accel_best = kernel_accel; - u32 kernel_loops_best = kernel_loops; - - u32 exec_best = -1; - - if ((kernel_accel_min < kernel_accel_max) || (kernel_loops_min < kernel_loops_max)) - { - const double exec_ms = try_run (device_param, kernel_accel_best, kernel_loops_best, 1); - - exec_best = exec_ms; - } - - // reset + // sometimes we're in a bad situation that the algorithm is so slow that we can not + // create enough kernel_accel to do both, keep the gpu busy and stay below target_ms. + // however, we need to have a minimum kernel_accel of 8. + // luckily, at this level of workload, it became a linear function - if (kernel_accel_min < kernel_accel_max) + while (kernel_accel < 8) { - u32 kernel_accel_try = kernel_accel; - u32 kernel_loops_try = kernel_loops; - - for (int i = 0; i < 2; i++) - { - kernel_accel_try >>= 1; - kernel_loops_try <<= 1; + const u32 kernel_accel_try = kernel_accel * 2; + const u32 kernel_loops_try = kernel_loops / 2; - if (kernel_accel_try < kernel_accel_min) break; - if (kernel_loops_try > kernel_loops_max) break; - - const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1); - - if (exec_ms < exec_best) - { - kernel_accel_best = kernel_accel_try; - kernel_loops_best = kernel_loops_try; + if (kernel_accel_try > kernel_accel_max) break; + if (kernel_loops_try < kernel_loops_min) break; - exec_best = exec_ms; - } - } + kernel_accel = kernel_accel_try; + kernel_loops = kernel_loops_try; } - // reset + // finally there's a chance that we have a fixed kernel_loops but not a fixed kernel_accel + // in such a case the above function would not create any change + // we'll use the runtime to find out if we're allow to do last improvement - if (kernel_loops_min < kernel_loops_max) + if (exec_ms_final > 0) { - u32 kernel_accel_try = kernel_accel; - u32 kernel_loops_try = kernel_loops; - - for (int i = 0; i < 2; i++) + if (exec_ms_final < target_ms) { - kernel_accel_try <<= 1; - kernel_loops_try >>= 1; + const double exec_left = target_ms / exec_ms_final; - if (kernel_accel_try > kernel_accel_max) break; - if (kernel_loops_try < kernel_loops_min) break; + const double accel_left = kernel_accel_max / kernel_accel; - const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1); + const double exec_accel_min = MIN (exec_left, accel_left); - if (exec_ms < exec_best) + if (exec_accel_min >= 2) { - kernel_accel_best = kernel_accel_try; - kernel_loops_best = kernel_loops_try; - - exec_best = exec_ms; + kernel_accel *= exec_accel_min; } } } - // because of the balance we may have some free space left! - - const int exec_left = target_ms / exec_best; - - const int accel_left = kernel_accel_max / kernel_accel_best; - - const int exec_accel_min = MIN (exec_left, accel_left); - - if (exec_accel_min) - { - kernel_accel_best *= exec_accel_min; - } - // reset timer device_param->exec_pos = 0; @@ -3046,9 +2953,6 @@ static void autotune (hc_device_param_t *device_param) // store - kernel_accel = kernel_accel_best; - kernel_loops = kernel_loops_best; - device_param->kernel_accel = kernel_accel; device_param->kernel_loops = kernel_loops; @@ -3064,12 +2968,11 @@ static void autotune (hc_device_param_t *device_param) log_info ("Device #%u: autotuned kernel-accel to %u\n" "Device #%u: autotuned kernel-loops to %u\n", - device_param->device_id + 1, - kernel_accel, - device_param->device_id + 1, - kernel_loops); + device_param->device_id + 1, kernel_accel, + device_param->device_id + 1, kernel_loops); fprintf (stdout, "%s", PROMPT); + fflush (stdout); } @@ -3320,41 +3223,13 @@ static void run_cracker (hc_device_param_t *device_param, const uint pws_cnt) hc_clEnqueueCopyBuffer (data.ocl, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (comb_t), 0, NULL, NULL); } - choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt); - if (data.benchmark == 1) { - double exec_ms_avg_prev = get_avg_exec_time (device_param, EXEC_CACHE); - - // a few caching rounds - - for (u32 i = 0; i < 2; i++) - { - hc_timer_set (&device_param->timer_speed); - - choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt); - - double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE); - - exec_ms_avg_prev = exec_ms_avg; - } - - // benchmark_repeats became a maximum possible repeats - - for (u32 i = 2; i < data.benchmark_repeats; i++) - { - hc_timer_set (&device_param->timer_speed); - - choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt); - - double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE); - - if ((exec_ms_avg_prev / exec_ms_avg) < 1.001) break; - - exec_ms_avg_prev = exec_ms_avg; - } + hc_timer_set (&device_param->timer_speed); } + choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt); + if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint (); if (data.devices_status == STATUS_CRACKED) break; @@ -5374,7 +5249,6 @@ int main (int argc, char **argv) uint version = VERSION; uint quiet = QUIET; uint benchmark = BENCHMARK; - uint benchmark_repeats = BENCHMARK_REPEATS; uint show = SHOW; uint left = LEFT; uint username = USERNAME; @@ -5472,7 +5346,6 @@ int main (int argc, char **argv) #define IDX_FORCE 0xff08 #define IDX_RUNTIME 0xff09 #define IDX_BENCHMARK 'b' - #define IDX_BENCHMARK_REPEATS 0xff78 #define IDX_HASH_MODE 'm' #define IDX_ATTACK_MODE 'a' #define IDX_RP_FILE 'r' @@ -5551,7 +5424,6 @@ int main (int argc, char **argv) {"outfile-check-dir", required_argument, 0, IDX_OUTFILE_CHECK_DIR}, {"force", no_argument, 0, IDX_FORCE}, {"benchmark", no_argument, 0, IDX_BENCHMARK}, - {"benchmark-repeats", required_argument, 0, IDX_BENCHMARK_REPEATS}, {"restore", no_argument, 0, IDX_RESTORE}, {"restore-disable", no_argument, 0, IDX_RESTORE_DISABLE}, {"status", no_argument, 0, IDX_STATUS}, @@ -5861,7 +5733,6 @@ int main (int argc, char **argv) case IDX_LIMIT: limit = atoll (optarg); break; case IDX_KEYSPACE: keyspace = 1; break; case IDX_BENCHMARK: benchmark = 1; break; - case IDX_BENCHMARK_REPEATS: benchmark_repeats = atoi (optarg); break; case IDX_RESTORE: break; case IDX_RESTORE_DISABLE: restore_disable = 1; break; case IDX_STATUS: status = 1; break; @@ -6604,7 +6475,6 @@ int main (int argc, char **argv) data.rp_gen_seed = rp_gen_seed; data.force = force; data.benchmark = benchmark; - data.benchmark_repeats = benchmark_repeats; data.skip = skip; data.limit = limit; #if defined(HAVE_HWMON) && defined(HAVE_ADL) @@ -6679,7 +6549,6 @@ int main (int argc, char **argv) logfile_top_uint (attack_mode); logfile_top_uint (attack_kern); logfile_top_uint (benchmark); - logfile_top_uint (benchmark_repeats); logfile_top_uint (bitmap_min); logfile_top_uint (bitmap_max); logfile_top_uint (debug_mode); @@ -10338,7 +10207,7 @@ int main (int argc, char **argv) attack_exec = ATTACK_EXEC_INSIDE_KERNEL; opts_type = OPTS_TYPE_PT_GENERATE_BE | OPTS_TYPE_PT_UNICODE - | OPTS_TYPE_PT_ADD80; + | OPTS_TYPE_PT_ADD80; kern_type = KERN_TYPE_PSTOKEN; dgst_size = DGST_SIZE_4_5; parse_func = pstoken_parse_hash;