Simplify auto-tuning and benchmark routines

Decrease the time it takes to run a benchmark
Removed --benchmark-repeats, it provides no advantage
Fix some wording related to drivers
Dropped special 64-bit rotate() handling for NV
Cleanup SHA384
Cleanup try_run()
pull/330/head
Jens Steube 8 years ago
parent fc89a04737
commit 72e3821a4c

@@ -116,10 +116,11 @@ void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], con

   ROUND_STEP (0);

-  ROUND_EXPAND (); ROUND_STEP (16);
-  ROUND_EXPAND (); ROUND_STEP (32);
-  ROUND_EXPAND (); ROUND_STEP (48);
-  ROUND_EXPAND (); ROUND_STEP (64);
+  #pragma unroll
+  for (int i = 16; i < 80; i += 16)
+  {
+    ROUND_EXPAND (); ROUND_STEP (i);
+  }

   /* rev
   digest[0] += a;

@@ -114,10 +114,11 @@ void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], con

   ROUND_STEP (0);

-  ROUND_EXPAND (); ROUND_STEP (16);
-  ROUND_EXPAND (); ROUND_STEP (32);
-  ROUND_EXPAND (); ROUND_STEP (48);
-  ROUND_EXPAND (); ROUND_STEP (64);
+  #pragma unroll
+  for (int i = 16; i < 80; i += 16)
+  {
+    ROUND_EXPAND (); ROUND_STEP (i);
+  }

   /* rev
   digest[0] += a;

@@ -114,10 +114,11 @@ void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], con

   ROUND_STEP (0);

-  ROUND_EXPAND (); ROUND_STEP (16);
-  ROUND_EXPAND (); ROUND_STEP (32);
-  ROUND_EXPAND (); ROUND_STEP (48);
-  ROUND_EXPAND (); ROUND_STEP (64);
+  #pragma unroll
+  for (int i = 16; i < 80; i += 16)
+  {
+    ROUND_EXPAND (); ROUND_STEP (i);
+  }

   /* rev
   digest[0] += a;
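
For reference, the loop form above is a direct substitution for the removed straight-line code: the trip count is constant, so #pragma unroll expands it back into the same four expand/step pairs, covering rounds 16..79 after ROUND_STEP (0) handles rounds 0..15 (16 + 4 * 16 = 80 rounds of the SHA-384 transform). A minimal plain-C sketch, with stub macros standing in for the kernel's ROUND_EXPAND/ROUND_STEP, tracing the identical step sequence:

#include <stdio.h>

/* stand-ins for the kernel macros, just to trace the expansion order */
#define ROUND_EXPAND()  printf ("expand ")
#define ROUND_STEP(i)   printf ("step %d\n", (i))

int main (void)
{
  ROUND_STEP (0);                    // rounds 0..15 use w0..w3 directly

  for (int i = 16; i < 80; i += 16)  // i = 16, 32, 48, 64
  {
    ROUND_EXPAND (); ROUND_STEP (i); // identical to the four removed pairs
  }

  return 0;                          // 16 + 4 * 16 = 80 rounds total
}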

@@ -311,40 +311,10 @@ inline u32 rotl32_S (const u32 a, const u32 n)
   return rotate (a, n);
 }

-#if CUDA_ARCH >= 350
-inline u64 rotr64_S (const u64 a, const u32 n)
-{
-  u32 il;
-  u32 ir;
-  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));
-  u32 tl;
-  u32 tr;
-  if (n >= 32)
-  {
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-  }
-  else
-  {
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-  }
-  u64 r;
-  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));
-  return r;
-}
-#else
 inline u64 rotr64_S (const u64 a, const u32 n)
 {
   return rotate (a, (u64) 64 - n);
 }
-#endif

 inline u64 rotl64_S (const u64 a, const u32 n)
 {
@@ -390,335 +360,14 @@ inline u32x rotl32 (const u32x a, const u32 n)
   return rotate (a, n);
 }

-#if CUDA_ARCH >= 350
-inline u64x rotr64 (const u64x a, const u32 n)
-{
-  u64x r;
-  u32 il;
-  u32 ir;
-  u32 tl;
-  u32 tr;
-  #if VECT_SIZE == 1
-  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));
-  if (n >= 32)
-  {
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-  }
-  else
-  {
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-  }
-  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));
-  #endif
-  #if VECT_SIZE >= 2
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s0));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s0) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s1));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s1) : "r"(tl), "r"(tr));
-  }
-  #endif
-  #if VECT_SIZE >= 4
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s2));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s2) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s3));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s3) : "r"(tl), "r"(tr));
-  }
-  #endif
-  #if VECT_SIZE >= 8
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s4));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s4) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s5));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s5) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s6));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s6) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s7));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s7) : "r"(tl), "r"(tr));
-  }
-  #endif
-  #if VECT_SIZE >= 16
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s8));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s8) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.s9));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.s9) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sa));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sa) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sb));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sb) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sc));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sc) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sd));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sd) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.se));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.se) : "r"(tl), "r"(tr));
-  }
-  {
-    asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a.sf));
-    if (n >= 32)
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-    }
-    else
-    {
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-      asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-    }
-    asm ("mov.b64 %0, {%1, %2};" : "=l"(r.sf) : "r"(tl), "r"(tr));
-  }
-  #endif
-  return r;
-}
-#else
 inline u64x rotr64 (const u64x a, const u32 n)
 {
   return rotate (a, (u64) 64 - n);
 }
-#endif

 inline u64x rotl64 (const u64x a, const u32 n)
 {
-  return rotr64 (a, (u64) 64 - n);
+  return rotate (a, (u64) n);
 }

 inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
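
For reference, the dropped NV path built a 64-bit rotate-right from two 32-bit funnel shifts (shf.r.wrap.b32); newer NVidia OpenCL runtimes make this unnecessary because rotate () on 64-bit values is handled well natively. A plain-C sketch of the equivalence the asm relied on; the helper names are ours, not hashcat's, and n is assumed to be in 1..63 as in the SHA-512-family kernels:

#include <stdint.h>
#include <assert.h>

/* reference rotate-right, what rotate (a, (u64) 64 - n) computes */
static uint64_t rotr64_ref (uint64_t a, uint32_t n)
{
  return (a >> n) | (a << (64 - n)); /* n in 1..63 */
}

/* the dropped PTX path: split into 32-bit halves, funnel-shift each half */
static uint64_t rotr64_funnel (uint64_t a, uint32_t n)
{
  uint32_t il = (uint32_t) a;         /* low  half, as in mov.b64 {lo, hi} */
  uint32_t ir = (uint32_t) (a >> 32); /* high half */

  uint32_t tl, tr;

  if (n >= 32) /* shf.r.wrap only uses the shift amount mod 32 */
  {
    tl = (uint32_t) ((((uint64_t) il << 32) | ir) >> (n - 32));
    tr = (uint32_t) ((((uint64_t) ir << 32) | il) >> (n - 32));
  }
  else
  {
    tl = (uint32_t) ((((uint64_t) ir << 32) | il) >> n);
    tr = (uint32_t) ((((uint64_t) il << 32) | ir) >> n);
  }

  return ((uint64_t) tr << 32) | tl;  /* mov.b64 r, {tl, tr} */
}

int main (void)
{
  for (uint32_t n = 1; n < 64; n++)
  {
    assert (rotr64_funnel (0x0123456789abcdefULL, n)
         == rotr64_ref    (0x0123456789abcdefULL, n));
  }

  return 0;
}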

@@ -118,3 +118,4 @@ It combines all features of all hashcat projects in one project.
 - Added -cl-std=CL1.1 to all kernel build options
 - Created environment variable to inform NVidia OpenCL runtime to not create its own kernel cache
 - Created environment variable to inform pocl OpenCL runtime to not create its own kernel cache
+- Dropped special 64-bit rotate() handling for NV, it seems that they've added it to their OpenCL runtime

@@ -1,9 +1,9 @@
 oclHashcat v2.10
 ================

-AMD users require Catalyst 14.9 or later (recommended 15.12 or later)
+AMD users require AMD drivers 14.9 or later (recommended 15.12 or later)
 Intel users require Intel OpenCL Runtime 14.2 or later (recommended 15.1 or later)
-NVidia users require ForceWare 346.59 or later (recommended 358.09 or later)
+NVidia users require NVidia drivers 346.59 or later (recommended 361.x or later)

 ##
 ## Features

@@ -33,7 +33,6 @@ double TARGET_MS_PROFILE[3] = { 8, 16, 96 };
 #define MARKOV_DISABLE 0
 #define MARKOV_CLASSIC 0
 #define BENCHMARK 0
-#define BENCHMARK_REPEATS 100
 #define RESTORE 0
 #define RESTORE_TIMER 60
 #define RESTORE_DISABLE 0
@@ -2612,7 +2611,7 @@ static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const
   if (rc != 0)
   {
     // NOTE: clEnqueueFillBuffer () always fails with -59
-    // IOW, it's not supported by Nvidia ForceWare <= 352.21, also pocl segfaults, also on apple
+    // IOW, it's not supported by Nvidia drivers <= 352.21, also pocl segfaults, also on apple
     // How's that possible, OpenCL 1.2 support is advertised??
     // We need to workaround...
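
A sketch of the workaround idea, not the exact hashcat code: when clEnqueueFillBuffer () cannot be trusted, zeros can be streamed from a host-side scratch buffer in chunks. Function name and chunk size here are illustrative:

#include <CL/cl.h>

/* Fallback zeroing: chunked clEnqueueWriteBuffer from a static zero buffer. */
static cl_int bzero_fallback (cl_command_queue queue, cl_mem buf, size_t size)
{
  static const unsigned char zeros[4096]; /* static storage, zero-initialized */

  for (size_t off = 0; off < size; off += sizeof (zeros))
  {
    const size_t chunk = (size - off < sizeof (zeros)) ? size - off : sizeof (zeros);

    const cl_int err = clEnqueueWriteBuffer (queue, buf, CL_TRUE, off, chunk, zeros, 0, NULL, NULL);

    if (err != CL_SUCCESS) return err;
  }

  return CL_SUCCESS;
}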
@@ -2788,62 +2787,57 @@ static void run_copy (hc_device_param_t *device_param, const uint pws_cnt)
   }
 }

-static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const int repeat)
+static double try_run (hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops)
 {
   const u32 kernel_power = device_param->device_processors * device_param->kernel_threads * kernel_accel;

-  device_param->kernel_params_buf32[26] = kernel_loops;
-  device_param->kernel_params_buf32[27] = kernel_loops;
+  device_param->kernel_params_buf32[25] = 0;
+  device_param->kernel_params_buf32[26] = kernel_loops; // not a bug, both need to be set
+  device_param->kernel_params_buf32[27] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes

-  // init some fake words
-
-  for (u32 i = 0; i < kernel_power; i++)
+  if (data.hash_mode == 10700)
   {
-    device_param->pws_buf[i].i[0]   = i;
-    device_param->pws_buf[i].i[1]   = 0x01234567;
-    device_param->pws_buf[i].pw_len = 4 + (i & 3);
-  }
-
-  hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+    // hash mode 10700 hangs on length 0 (unlimited loop)

-  if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
-  {
-    run_kernel_amp (device_param, kernel_power);
-  }
+    for (u32 i = 0; i < kernel_power; i++)
+    {
+      device_param->pws_buf[i].i[0]   = i;
+      device_param->pws_buf[i].i[1]   = i + 0x01234567;
+      device_param->pws_buf[i].i[2]   = i + 0x89abcdef;
+      device_param->pws_buf[i].i[3]   = 0xffffffff;
+      device_param->pws_buf[i].pw_len = 4 + (i & 3);
+    }

-  // caching run
+    hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+
+    if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+    {
+      run_kernel_amp (device_param, kernel_power);
+    }
+  }

   if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
   {
-    run_kernel (KERN_RUN_1, device_param, kernel_power, false);
+    run_kernel (KERN_RUN_1, device_param, kernel_power, true);
   }
   else
   {
-    run_kernel (KERN_RUN_2, device_param, kernel_power, false);
+    run_kernel (KERN_RUN_2, device_param, kernel_power, true);
   }

-  // now user repeats
-
-  for (int i = 0; i < repeat; i++)
-  {
-    if (data.attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
-    {
-      run_kernel (KERN_RUN_1, device_param, kernel_power, true);
-    }
-    else
-    {
-      run_kernel (KERN_RUN_2, device_param, kernel_power, true);
-    }
-  }
-
-  const double exec_ms_prev = get_avg_exec_time (device_param, repeat);
+  const double exec_ms_prev = get_avg_exec_time (device_param, 1);

-  // reset fake words
-
-  memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
+  if (data.hash_mode == 10700)
+  {
+    // reset fake words

-  hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
-  hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+    memset (device_param->pws_buf, 0, kernel_power * sizeof (pw_t));
+
+    hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+    hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
+  }

   return exec_ms_prev;
 }
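
try_run () now times exactly one kernel invocation, and get_avg_exec_time (device_param, 1) reduces to the most recent entry of the per-device timing ring (the same ring that device_param->exec_pos indexes, EXEC_CACHE entries deep). A simplified sketch of that bookkeeping; the struct layout is an assumption, not hashcat's exact types:

#include <stdio.h>

#define EXEC_CACHE 128

typedef struct
{
  double exec_ms[EXEC_CACHE]; /* per-invocation kernel runtimes */
  int    exec_pos;            /* next slot to overwrite */
} exec_ring_t;

/* average over the last_num_entries most recent measurements;
   with last_num_entries == 1 this is just the latest runtime */
static double get_avg_exec_time_sketch (const exec_ring_t *r, int last_num_entries)
{
  double sum = 0;

  int pos = r->exec_pos;

  for (int i = 0; i < last_num_entries; i++)
  {
    pos = (pos - 1 + EXEC_CACHE) % EXEC_CACHE; /* walk backwards through the ring */

    sum += r->exec_ms[pos];
  }

  return sum / last_num_entries;
}

int main (void)
{
  exec_ring_t ring = { { 0 }, 0 };

  /* record three fake measurements: 5 ms, 7 ms, 9 ms */
  ring.exec_ms[ring.exec_pos++] = 5;
  ring.exec_ms[ring.exec_pos++] = 7;
  ring.exec_ms[ring.exec_pos++] = 9;

  printf ("%f\n", get_avg_exec_time_sketch (&ring, 1)); /* 9.0, the latest run */

  return 0;
}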
@@ -2861,183 +2855,96 @@ static void autotune (hc_device_param_t *device_param)
   u32 kernel_accel = kernel_accel_min;
   u32 kernel_loops = kernel_loops_min;

-  // steps
-
   #define STEPS_CNT 10

-  #define STEPS_ACCEL_CNT (STEPS_CNT + 2)
-  #define STEPS_LOOPS_CNT (STEPS_CNT + 2)
+  #define MAX_RETRIES 1

-  u32 steps_accel[STEPS_ACCEL_CNT];
-  u32 steps_loops[STEPS_LOOPS_CNT];
+  double exec_ms_final = 0;

-  for (int i = 0; i < STEPS_ACCEL_CNT; i++)
-  {
-    steps_accel[i] = 1 << i;
-  }
+  // first find out highest kernel-loops that stays below target_ms

-  for (int i = 0; i < STEPS_LOOPS_CNT; i++)
+  for (kernel_loops = kernel_loops_max; kernel_loops > kernel_loops_min; kernel_loops >>= 1)
   {
-    steps_loops[i] = 1 << i;
-  }
+    double exec_ms_best = try_run (device_param, kernel_accel_min, kernel_loops);

-  steps_accel[STEPS_CNT + 0] = kernel_accel_min;
-  steps_accel[STEPS_CNT + 1] = kernel_accel_max;
+    for (int i = 0; i < MAX_RETRIES; i++)
+    {
+      const double exec_ms_cur = try_run (device_param, kernel_accel_min, kernel_loops);

-  steps_loops[STEPS_CNT + 0] = kernel_loops_min;
-  steps_loops[STEPS_CNT + 1] = kernel_loops_max;
+      exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
+    }

-  qsort (steps_accel, STEPS_ACCEL_CNT, sizeof (u32), sort_by_u32);
-  qsort (steps_loops, STEPS_LOOPS_CNT, sizeof (u32), sort_by_u32);
+    if (exec_ms_final == 0) exec_ms_final = exec_ms_best;

-  // find out highest kernel-loops that stays below target_ms, we can use it later for multiplication as this is a linear function
-
-  u32 kernel_loops_tmp;
-
-  for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp > kernel_loops_min; kernel_loops_tmp >>= 1)
-  {
-    const double exec_ms = try_run (device_param, kernel_accel_min, kernel_loops_tmp, 1);
-
-    if (exec_ms < target_ms) break;
+    if (exec_ms_best < target_ms) break;
   }

-  // kernel-accel
+  // now the same for kernel-accel but with the new kernel-loops from previous loop set

   if (kernel_accel_min < kernel_accel_max)
   {
-    double e_best = 0;
-
-    for (int i = 0; i < STEPS_ACCEL_CNT; i++)
+    for (int i = 0; i < STEPS_CNT; i++)
     {
-      const u32 kernel_accel_try = steps_accel[i];
+      const u32 kernel_accel_try = 1 << i;

       if (kernel_accel_try < kernel_accel_min) continue;
       if (kernel_accel_try > kernel_accel_max) break;

-      const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
-
-      if (exec_ms > target_ms) break;
+      double exec_ms_best = try_run (device_param, kernel_accel_try, kernel_loops);

-      const double e = kernel_accel_try / exec_ms;
-
-      if (e > e_best)
+      for (int i = 0; i < MAX_RETRIES; i++)
       {
-        kernel_accel = kernel_accel_try;
+        const double exec_ms_cur = try_run (device_param, kernel_accel_try, kernel_loops);

-        e_best = e;
+        exec_ms_best = MIN (exec_ms_best, exec_ms_cur);
       }
-    }
-  }

-  // kernel-loops final
+      if (exec_ms_best > target_ms) break;

-  if (kernel_loops_min < kernel_loops_max)
-  {
-    double e_best = 0;
-
-    for (int i = 0; i < STEPS_LOOPS_CNT; i++)
-    {
-      const u32 kernel_loops_try = steps_loops[i];
-
-      if (kernel_loops_try < kernel_loops_min) continue;
-      if (kernel_loops_try > kernel_loops_max) break;
-
-      const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
-
-      if (exec_ms > target_ms) break;
+      exec_ms_final = exec_ms_best;

-      const double e = kernel_loops_try / exec_ms;
-
-      if (e > e_best)
-      {
-        kernel_loops = kernel_loops_try;
-
-        e_best = e;
-      }
+      kernel_accel = kernel_accel_try;
     }
   }

-  // final balance
-
-  u32 kernel_accel_best = kernel_accel;
-  u32 kernel_loops_best = kernel_loops;
-
-  u32 exec_best = -1;
-
-  if ((kernel_accel_min < kernel_accel_max) || (kernel_loops_min < kernel_loops_max))
-  {
-    const double exec_ms = try_run (device_param, kernel_accel_best, kernel_loops_best, 1);
-
-    exec_best = exec_ms;
-  }
-
-  // reset
+  // sometimes we're in a bad situation that the algorithm is so slow that we can not
+  // create enough kernel_accel to do both, keep the gpu busy and stay below target_ms.
+  // however, we need to have a minimum kernel_accel of 8.
+  // luckily, at this level of workload, it became a linear function

-  if (kernel_accel_min < kernel_accel_max)
+  while (kernel_accel < 8)
   {
-    u32 kernel_accel_try = kernel_accel;
-    u32 kernel_loops_try = kernel_loops;
-
-    for (int i = 0; i < 2; i++)
-    {
-      kernel_accel_try >>= 1;
-      kernel_loops_try <<= 1;
+    const u32 kernel_accel_try = kernel_accel * 2;
+    const u32 kernel_loops_try = kernel_loops / 2;

-      if (kernel_accel_try < kernel_accel_min) break;
-      if (kernel_loops_try > kernel_loops_max) break;
-
-      const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
-
-      if (exec_ms < exec_best)
-      {
-        kernel_accel_best = kernel_accel_try;
-        kernel_loops_best = kernel_loops_try;
+    if (kernel_accel_try > kernel_accel_max) break;
+    if (kernel_loops_try < kernel_loops_min) break;

-        exec_best = exec_ms;
-      }
-    }
+    kernel_accel = kernel_accel_try;
+    kernel_loops = kernel_loops_try;
   }

-  // reset
-
-  if (kernel_loops_min < kernel_loops_max)
-  {
-    u32 kernel_accel_try = kernel_accel;
-    u32 kernel_loops_try = kernel_loops;
-
-    for (int i = 0; i < 2; i++)
-    {
-      kernel_accel_try <<= 1;
-      kernel_loops_try >>= 1;
-
-      if (kernel_accel_try > kernel_accel_max) break;
-      if (kernel_loops_try < kernel_loops_min) break;
+  // finally there's a chance that we have a fixed kernel_loops but not a fixed kernel_accel
+  // in such a case the above function would not create any change
+  // we'll use the runtime to find out if we're allow to do last improvement

-      const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
+  if (exec_ms_final > 0)
+  {
+    if (exec_ms_final < target_ms)
+    {
+      const double exec_left = target_ms / exec_ms_final;

-      if (exec_ms < exec_best)
-      {
-        kernel_accel_best = kernel_accel_try;
-        kernel_loops_best = kernel_loops_try;
+      const double accel_left = kernel_accel_max / kernel_accel;

-        exec_best = exec_ms;
-      }
-    }
-  }
+      const double exec_accel_min = MIN (exec_left, accel_left);

-  // because of the balance we may have some free space left!
-
-  const int exec_left = target_ms / exec_best;
-
-  const int accel_left = kernel_accel_max / kernel_accel_best;
-
-  const int exec_accel_min = MIN (exec_left, accel_left);
-
-  if (exec_accel_min)
-  {
-    kernel_accel_best *= exec_accel_min;
+      if (exec_accel_min >= 2)
+      {
+        kernel_accel *= exec_accel_min;
+      }
+    }
   }

   // reset timer

   device_param->exec_pos = 0;
@@ -3046,9 +2953,6 @@ static void autotune (hc_device_param_t *device_param)
   // store

-  kernel_accel = kernel_accel_best;
-  kernel_loops = kernel_loops_best;
-
   device_param->kernel_accel = kernel_accel;
   device_param->kernel_loops = kernel_loops;
@@ -3064,12 +2968,11 @@ static void autotune (hc_device_param_t *device_param)
     log_info ("Device #%u: autotuned kernel-accel to %u\n"
               "Device #%u: autotuned kernel-loops to %u\n",
-              device_param->device_id + 1,
-              kernel_accel,
-              device_param->device_id + 1,
-              kernel_loops);
+              device_param->device_id + 1, kernel_accel,
+              device_param->device_id + 1, kernel_loops);

     fprintf (stdout, "%s", PROMPT);
     fflush (stdout);
   }
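
The last step of the tuning above is plain proportional arithmetic: runtime is roughly linear in kernel_accel, so the remaining headroom is the smaller of the runtime ratio and the accel ratio. A worked example with hypothetical numbers:

#include <stdio.h>

int main (void)
{
  const double target_ms        = 96;   /* e.g. TARGET_MS_PROFILE[2] */
  const double exec_ms_final    = 24;   /* hypothetical measurement  */
  const double kernel_accel     = 16;   /* hypothetical tuned value  */
  const double kernel_accel_max = 1024;

  const double exec_left  = target_ms / exec_ms_final;      /* 4: headroom in runtime      */
  const double accel_left = kernel_accel_max / kernel_accel; /* 64: headroom in kernel_accel */

  const double exec_accel_min = (exec_left < accel_left) ? exec_left : accel_left;

  /* kernel_accel *= exec_accel_min -> 16 * 4 = 64, expected to land near target_ms */
  if (exec_accel_min >= 2) printf ("kernel_accel -> %.0f\n", kernel_accel * exec_accel_min);

  return 0;
}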
@@ -3320,41 +3223,13 @@ static void run_cracker (hc_device_param_t *device_param, const uint pws_cnt)
         hc_clEnqueueCopyBuffer (data.ocl, device_param->command_queue, device_param->d_combs, device_param->d_combs_c, 0, 0, innerloop_left * sizeof (comb_t), 0, NULL, NULL);
       }

-      choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
-      if (data.benchmark == 1)
-      {
-        double exec_ms_avg_prev = get_avg_exec_time (device_param, EXEC_CACHE);
-
-        // a few caching rounds
-
-        for (u32 i = 0; i < 2; i++)
-        {
-          hc_timer_set (&device_param->timer_speed);
-
-          choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
-          double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
-          exec_ms_avg_prev = exec_ms_avg;
-        }
-
-        // benchmark_repeats became a maximum possible repeats
-
-        for (u32 i = 2; i < data.benchmark_repeats; i++)
-        {
-          hc_timer_set (&device_param->timer_speed);
-
-          choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);
-
-          double exec_ms_avg = get_avg_exec_time (device_param, EXEC_CACHE);
-
-          if ((exec_ms_avg_prev / exec_ms_avg) < 1.001) break;
-
-          exec_ms_avg_prev = exec_ms_avg;
-        }
-
-        hc_timer_set (&device_param->timer_speed);
-      }
+      choose_kernel (device_param, data.attack_exec, data.attack_mode, data.opts_type, salt_buf, highest_pw_len, pws_cnt);

       if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint ();

       if (data.devices_status == STATUS_CRACKED) break;
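
For context, the removed benchmark loop treated --benchmark-repeats as an upper bound and stopped early once two successive averages agreed within 0.1 %. A tiny illustration of that convergence test with made-up numbers:

#include <stdio.h>

int main (void)
{
  /* hypothetical successive benchmark averages, in ms */
  const double exec_ms_avg_prev = 10.005;
  const double exec_ms_avg      = 10.000;

  /* the removed test: stop once the improvement drops below 0.1 % */
  if ((exec_ms_avg_prev / exec_ms_avg) < 1.001)
  {
    printf ("converged, stop repeating\n"); /* 1.0005 < 1.001 -> break */
  }

  return 0;
}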
@@ -5374,7 +5249,6 @@ int main (int argc, char **argv)
   uint version = VERSION;
   uint quiet = QUIET;
   uint benchmark = BENCHMARK;
-  uint benchmark_repeats = BENCHMARK_REPEATS;
   uint show = SHOW;
   uint left = LEFT;
   uint username = USERNAME;
@@ -5472,7 +5346,6 @@ int main (int argc, char **argv)
   #define IDX_FORCE 0xff08
   #define IDX_RUNTIME 0xff09
   #define IDX_BENCHMARK 'b'
-  #define IDX_BENCHMARK_REPEATS 0xff78
   #define IDX_HASH_MODE 'm'
   #define IDX_ATTACK_MODE 'a'
   #define IDX_RP_FILE 'r'
@@ -5551,7 +5424,6 @@ int main (int argc, char **argv)
     {"outfile-check-dir", required_argument, 0, IDX_OUTFILE_CHECK_DIR},
     {"force", no_argument, 0, IDX_FORCE},
     {"benchmark", no_argument, 0, IDX_BENCHMARK},
-    {"benchmark-repeats", required_argument, 0, IDX_BENCHMARK_REPEATS},
     {"restore", no_argument, 0, IDX_RESTORE},
     {"restore-disable", no_argument, 0, IDX_RESTORE_DISABLE},
     {"status", no_argument, 0, IDX_STATUS},
@@ -5861,7 +5733,6 @@ int main (int argc, char **argv)
       case IDX_LIMIT: limit = atoll (optarg); break;
       case IDX_KEYSPACE: keyspace = 1; break;
       case IDX_BENCHMARK: benchmark = 1; break;
-      case IDX_BENCHMARK_REPEATS: benchmark_repeats = atoi (optarg); break;
       case IDX_RESTORE: break;
       case IDX_RESTORE_DISABLE: restore_disable = 1; break;
       case IDX_STATUS: status = 1; break;
@@ -6604,7 +6475,6 @@ int main (int argc, char **argv)
   data.rp_gen_seed = rp_gen_seed;
   data.force = force;
   data.benchmark = benchmark;
-  data.benchmark_repeats = benchmark_repeats;
   data.skip = skip;
   data.limit = limit;
   #if defined(HAVE_HWMON) && defined(HAVE_ADL)
@@ -6679,7 +6549,6 @@ int main (int argc, char **argv)
   logfile_top_uint (attack_mode);
   logfile_top_uint (attack_kern);
   logfile_top_uint (benchmark);
-  logfile_top_uint (benchmark_repeats);
   logfile_top_uint (bitmap_min);
   logfile_top_uint (bitmap_max);
   logfile_top_uint (debug_mode);
@@ -10338,7 +10207,7 @@ int main (int argc, char **argv)
                  attack_exec = ATTACK_EXEC_INSIDE_KERNEL;
                  opts_type   = OPTS_TYPE_PT_GENERATE_BE
                              | OPTS_TYPE_PT_UNICODE
-                             | OPTS_TYPE_PT_ADD80;
+                              | OPTS_TYPE_PT_ADD80;
                  kern_type   = KERN_TYPE_PSTOKEN;
                  dgst_size   = DGST_SIZE_4_5;
                  parse_func  = pstoken_parse_hash;
