From 006f5252b7f47548ebf9ce4cf783b0890b32e0ee Mon Sep 17 00:00:00 2001 From: jsteube Date: Sat, 7 May 2016 13:15:21 +0200 Subject: [PATCH] Optimize a few modes for hashcat_tuning.hctab for budget NV cards Little experiment with MD4 based optimizations on -m 900 -m 1000 and -m 1100 Fix benchmark in case user fixes -u and -n values --- OpenCL/m00900_a3.cl | 12 ++--- OpenCL/m01100_a3.cl | 112 ++++++++++++++++++++++--------------------- hashcat_tuning.hctab | 18 ++++--- src/oclHashcat.c | 14 +++++- 4 files changed, 82 insertions(+), 74 deletions(-) diff --git a/OpenCL/m00900_a3.cl b/OpenCL/m00900_a3.cl index 78153d69b..5170c0ecd 100644 --- a/OpenCL/m00900_a3.cl +++ b/OpenCL/m00900_a3.cl @@ -331,15 +331,9 @@ void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_r MD4_STEP0(MD4_Go, b, c, d, a, G_wdc01, MD4S13); MD4_STEP0(MD4_Go, a, b, c, d, G_w2c01, MD4S10); MD4_STEP0(MD4_Go, d, a, b, c, G_w6c01, MD4S11); - MD4_STEP0(MD4_Go, c, d, a, b, G_wac01, MD4S12); - - if (MATCHES_NONE_VV (c, pre_c)) continue; - - MD4_STEP0(MD4_Go, b, c, d, a, G_wec01, MD4S13); - MD4_STEP0(MD4_Go, a, b, c, d, G_w3c01, MD4S10); - - if (MATCHES_NONE_VV (a, pre_a)) continue; - + MD4_STEP0(MD4_Go, c, d, a, b, G_wac01, MD4S12); if (MATCHES_NONE_VV (c, pre_c)) continue; + MD4_STEP0(MD4_Go, b, c, d, a, G_wec01, MD4S13); if (MATCHES_NONE_VV (b, pre_b)) continue; + MD4_STEP0(MD4_Go, a, b, c, d, G_w3c01, MD4S10); if (MATCHES_NONE_VV (a, pre_a)) continue; MD4_STEP0(MD4_Go, d, a, b, c, G_w7c01, MD4S11); MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12); MD4_STEP0(MD4_Go, b, c, d, a, G_wfc01, MD4S13); diff --git a/OpenCL/m01100_a3.cl b/OpenCL/m01100_a3.cl index 8c6bad61e..cfdfa27fb 100644 --- a/OpenCL/m01100_a3.cl +++ b/OpenCL/m01100_a3.cl @@ -33,24 +33,24 @@ void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_r * salt */ - u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; + #define salt_buf00 salt_bufs[salt_pos].salt_buf[ 0] + #define salt_buf01 salt_bufs[salt_pos].salt_buf[ 1] + #define salt_buf02 salt_bufs[salt_pos].salt_buf[ 2] + #define salt_buf03 salt_bufs[salt_pos].salt_buf[ 3] + #define salt_buf04 salt_bufs[salt_pos].salt_buf[ 4] + #define salt_buf05 salt_bufs[salt_pos].salt_buf[ 5] + #define salt_buf06 salt_bufs[salt_pos].salt_buf[ 6] + #define salt_buf07 salt_bufs[salt_pos].salt_buf[ 7] + #define salt_buf08 salt_bufs[salt_pos].salt_buf[ 8] + #define salt_buf09 salt_bufs[salt_pos].salt_buf[ 9] + #define salt_buf10 salt_bufs[salt_pos].salt_buf[10] + #define salt_buf11 salt_bufs[salt_pos].salt_buf[11] + #define salt_buf12 salt_bufs[salt_pos].salt_buf[12] + #define salt_buf13 salt_bufs[salt_pos].salt_buf[13] + #define salt_buf14 salt_bufs[salt_pos].salt_buf[14] + #define salt_buf15 salt_bufs[salt_pos].salt_buf[15] - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; + #define salt_len salt_bufs[salt_pos].salt_len /** * base @@ -119,6 +119,8 @@ void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_r const u32x w0 = w0l | w0r; + barrier (CLK_LOCAL_MEM_FENCE); + u32x a = MD4M_A; u32x b = MD4M_B; u32x c = MD4M_C; @@ -189,16 +191,16 @@ void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_r w0_t[1] = b; w0_t[2] = c; w0_t[3] = d; - w1_t[0] = salt_buf0[0]; - w1_t[1] = salt_buf0[1]; - w1_t[2] = salt_buf0[2]; - w1_t[3] = salt_buf0[3]; - w2_t[0] = salt_buf1[0]; - w2_t[1] = salt_buf1[1]; - w2_t[2] = salt_buf1[2]; - w2_t[3] = salt_buf1[3]; - w3_t[0] = salt_buf2[0]; - w3_t[1] = salt_buf2[1]; + w1_t[0] = salt_buf00; + w1_t[1] = salt_buf01; + w1_t[2] = salt_buf02; + w1_t[3] = salt_buf03; + w2_t[0] = salt_buf04; + w2_t[1] = salt_buf05; + w2_t[2] = salt_buf06; + w2_t[3] = salt_buf07; + w3_t[0] = salt_buf08; + w3_t[1] = salt_buf09; w3_t[2] = (16 + salt_len) * 8; w3_t[3] = 0; @@ -275,24 +277,24 @@ void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_r * salt */ - u32 salt_buf0[4]; - u32 salt_buf1[4]; - u32 salt_buf2[4]; + #define salt_buf00 salt_bufs[salt_pos].salt_buf[ 0] + #define salt_buf01 salt_bufs[salt_pos].salt_buf[ 1] + #define salt_buf02 salt_bufs[salt_pos].salt_buf[ 2] + #define salt_buf03 salt_bufs[salt_pos].salt_buf[ 3] + #define salt_buf04 salt_bufs[salt_pos].salt_buf[ 4] + #define salt_buf05 salt_bufs[salt_pos].salt_buf[ 5] + #define salt_buf06 salt_bufs[salt_pos].salt_buf[ 6] + #define salt_buf07 salt_bufs[salt_pos].salt_buf[ 7] + #define salt_buf08 salt_bufs[salt_pos].salt_buf[ 8] + #define salt_buf09 salt_bufs[salt_pos].salt_buf[ 9] + #define salt_buf10 salt_bufs[salt_pos].salt_buf[10] + #define salt_buf11 salt_bufs[salt_pos].salt_buf[11] + #define salt_buf12 salt_bufs[salt_pos].salt_buf[12] + #define salt_buf13 salt_bufs[salt_pos].salt_buf[13] + #define salt_buf14 salt_bufs[salt_pos].salt_buf[14] + #define salt_buf15 salt_bufs[salt_pos].salt_buf[15] - salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; - salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; - salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; - salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; - salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; - salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; - salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; - salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; - salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; - salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; - salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; - salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; - - const u32 salt_len = salt_bufs[salt_pos].salt_len; + #define salt_len salt_bufs[salt_pos].salt_len /** * base @@ -373,6 +375,8 @@ void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_r const u32x w0 = w0l | w0r; + barrier (CLK_LOCAL_MEM_FENCE); + u32x a = MD4M_A; u32x b = MD4M_B; u32x c = MD4M_C; @@ -443,16 +447,16 @@ void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_r w0_t[1] = b; w0_t[2] = c; w0_t[3] = d; - w1_t[0] = salt_buf0[0]; - w1_t[1] = salt_buf0[1]; - w1_t[2] = salt_buf0[2]; - w1_t[3] = salt_buf0[3]; - w2_t[0] = salt_buf1[0]; - w2_t[1] = salt_buf1[1]; - w2_t[2] = salt_buf1[2]; - w2_t[3] = salt_buf1[3]; - w3_t[0] = salt_buf2[0]; - w3_t[1] = salt_buf2[1]; + w1_t[0] = salt_buf00; + w1_t[1] = salt_buf01; + w1_t[2] = salt_buf02; + w1_t[3] = salt_buf03; + w2_t[0] = salt_buf04; + w2_t[1] = salt_buf05; + w2_t[2] = salt_buf06; + w2_t[3] = salt_buf07; + w3_t[0] = salt_buf08; + w3_t[1] = salt_buf09; w3_t[2] = (16 + salt_len) * 8; w3_t[3] = 0; diff --git a/hashcat_tuning.hctab b/hashcat_tuning.hctab index 43391da80..7d5befc0a 100644 --- a/hashcat_tuning.hctab +++ b/hashcat_tuning.hctab @@ -64,17 +64,17 @@ DEVICE_TYPE_CPU * * N A #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops -ALIAS_nv_budget * 0 2 A A -ALIAS_nv_budget * 11 2 A A -ALIAS_nv_budget * 12 2 A A +ALIAS_nv_budget * 0 4 A A +ALIAS_nv_budget * 11 4 A A +ALIAS_nv_budget * 12 4 A A ALIAS_nv_budget * 21 2 A A ALIAS_nv_budget * 22 2 A A ALIAS_nv_budget * 23 2 A A -ALIAS_nv_budget * 200 2 A A +ALIAS_nv_budget * 200 4 A A ALIAS_nv_budget * 400 2 A A -ALIAS_nv_budget * 900 2 A A -ALIAS_nv_budget * 1000 2 A A -ALIAS_nv_budget * 1100 2 A A +ALIAS_nv_budget * 900 4 A A +ALIAS_nv_budget * 1000 4 A A +ALIAS_nv_budget * 1100 4 A A ALIAS_nv_budget * 2400 2 A A ALIAS_nv_budget * 2410 2 A A ALIAS_nv_budget * 2600 2 A A @@ -85,11 +85,9 @@ ALIAS_nv_budget * 2811 2 A ALIAS_nv_budget * 3711 2 A A ALIAS_nv_budget * 5100 2 A A ALIAS_nv_budget * 5500 2 A A -ALIAS_nv_budget * 6000 2 A A -ALIAS_nv_budget * 9900 2 A A +ALIAS_nv_budget * 9900 4 A A ALIAS_nv_budget * 10100 2 A A ALIAS_nv_budget * 11000 2 A A -ALIAS_nv_budget * 11400 2 A A #Device Attack Hash Vector Kernel Kernel #Name Mode Type Width Accel Loops diff --git a/src/oclHashcat.c b/src/oclHashcat.c index bf45fe02f..ea3c9c349 100644 --- a/src/oclHashcat.c +++ b/src/oclHashcat.c @@ -2859,7 +2859,19 @@ static void autotune (hc_device_param_t *device_param) #define MAX_RETRIES 1 - double exec_ms_final = 0; + if ((kernel_loops_min == kernel_loops_max) || (kernel_accel_min == kernel_accel_max)) + { + // we do this in case the user specified a fixed -u and -n on the commandline + // so we have a cached kernel for benchmark + + try_run (device_param, kernel_accel, kernel_loops); + try_run (device_param, kernel_accel, kernel_loops); + try_run (device_param, kernel_accel, kernel_loops); + try_run (device_param, kernel_accel, kernel_loops); + try_run (device_param, kernel_accel, kernel_loops); + } + + double exec_ms_final = try_run (device_param, kernel_accel, kernel_loops); // first find out highest kernel-loops that stays below target_ms