From a3a16f676feba5ff8612d7efeede606cbec390b8 Mon Sep 17 00:00:00 2001 From: jsteube Date: Mon, 5 Feb 2018 17:18:58 +0100 Subject: [PATCH] OpenCL Kernels: Add a decompressing kernel and a compressing host code in order to reduce PCIe transfer time For details see https://hashcat.net/forum/thread-7267.html --- OpenCL/amp_a0.cl | 4 +- OpenCL/amp_a1.cl | 4 +- OpenCL/amp_a3.cl | 2 +- OpenCL/inc_common.cl | 33 +++++ OpenCL/inc_types.cl | 8 ++ docs/changes.txt | 1 + include/types.h | 23 ++- src/autotune.c | 17 --- src/dispatch.c | 6 +- src/opencl.c | 328 ++++++++++++++++++++++++++++++++++--------- src/selftest.c | 3 +- src/wordlist.c | 22 ++- 12 files changed, 354 insertions(+), 97 deletions(-) diff --git a/OpenCL/amp_a0.cl b/OpenCL/amp_a0.cl index 5557ba07c..9f8a5cb3f 100644 --- a/OpenCL/amp_a0.cl +++ b/OpenCL/amp_a0.cl @@ -17,9 +17,9 @@ __kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __constant const if (rules_buf[0].cmds[0] == RULE_OP_MANGLE_NOOP && rules_buf[0].cmds[1] == 0) return; - pw_t pw = pws[gid]; + pw_t pw = pws_amp[gid]; pw.pw_len = apply_rules (rules_buf[0].cmds, pw.i, pw.pw_len); - pws_amp[gid] = pw; + pws[gid] = pw; } diff --git a/OpenCL/amp_a1.cl b/OpenCL/amp_a1.cl index dd896032f..ec8db8275 100644 --- a/OpenCL/amp_a1.cl +++ b/OpenCL/amp_a1.cl @@ -13,7 +13,7 @@ __kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __global const ke if (gid >= gid_max) return; - pw_t pw = pws[gid]; + pw_t pw = pws_amp[gid]; pw_t comb = combs_buf[0]; @@ -39,5 +39,5 @@ __kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __global const ke pw.pw_len = pw_len + comb_len; - pws_amp[gid] = pw; + pws[gid] = pw; } diff --git a/OpenCL/amp_a3.cl b/OpenCL/amp_a3.cl index 69db61604..e2b5d9095 100644 --- a/OpenCL/amp_a3.cl +++ b/OpenCL/amp_a3.cl @@ -15,5 +15,5 @@ __kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __global const ke const u32 w0r = bfs_buf[0].i; - pws_amp[gid].i[0] |= w0r; + pws[gid].i[0] |= w0r; } diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 1ffbcf33f..44d7bd8c5 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -61606,6 +61606,39 @@ void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u #endif } +static void gpu_decompress_entry (__global pw_idx_t *pws_idx, __global u32 *pws_comp, pw_t *pw, const u64 gid) +{ + const u32 off = pws_idx[gid].off; + const u32 cnt = pws_idx[gid].cnt; + const u32 len = pws_idx[gid].len; + + #pragma unroll + for (u32 i = 0; i < 64; i++) + { + pw->i[i] = 0; + } + + for (u32 i = 0, j = off; i < cnt; i++, j++) + { + pw->i[i] = pws_comp[j]; + } + + pw->pw_len = len; +} + +__kernel void gpu_decompress (__global pw_idx_t *pws_idx, __global u32 *pws_comp, __global pw_t *pws_buf, const u64 gid_max) +{ + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + pw_t pw; + + gpu_decompress_entry (pws_idx, pws_comp, &pw, gid); + + pws_buf[gid] = pw; +} + __kernel void gpu_memset (__global uint4 *buf, const u32 value, const u64 gid_max) { const u64 gid = get_global_id (0); diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index 45dc51ee4..248dfe74f 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -1965,6 +1965,14 @@ typedef struct pw } pw_t; +typedef struct pw_idx +{ + u32 off; + u32 cnt; + u32 len; + +} pw_idx_t; + typedef struct bf { u32 i; diff --git a/docs/changes.txt b/docs/changes.txt index 8f7f01abd..14586299e 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -37,6 +37,7 @@ ## Improvements ## +- OpenCL Kernels: Add a decompressing kernel and a compressing host code in order to reduce PCIe transfer time - OpenCL Kernels: Remove password length restriction to 16 for Cisco-PIX and Cisco-ASA hashes - Terminal: Display Set Cost/Rounds During Benchmarking - Terminal: Show [r]esume in prompt only in pause mode, and show [p]ause in prompt only in resume mode diff --git a/include/types.h b/include/types.h index f3e0e43e7..52b6b5bf2 100644 --- a/include/types.h +++ b/include/types.h @@ -860,6 +860,14 @@ typedef struct pw } pw_t; +typedef struct pw_idx +{ + u32 off; + u32 cnt; + u32 len; + +} pw_idx_t; + typedef struct bf { u32 i; @@ -935,6 +943,7 @@ typedef struct hc_device_param u32 kernel_threads_by_wgs_kernel_tm; u32 kernel_threads_by_wgs_kernel_memset; u32 kernel_threads_by_wgs_kernel_atinit; + u32 kernel_threads_by_wgs_kernel_decompress; u32 kernel_loops; u32 kernel_accel; @@ -951,6 +960,8 @@ typedef struct hc_device_param size_t size_pws; size_t size_pws_amp; + size_t size_pws_comp; + size_t size_pws_idx; size_t size_tmps; size_t size_hooks; size_t size_bfs; @@ -973,8 +984,9 @@ typedef struct hc_device_param void *hooks_buf; - pw_t *pws_buf; - u32 pws_cnt; + pw_idx_t *pws_idx; + u32 *pws_comp; + u32 pws_cnt; u64 words_off; u64 words_done; @@ -1038,6 +1050,7 @@ typedef struct hc_device_param cl_kernel kernel_tm; cl_kernel kernel_memset; cl_kernel kernel_atinit; + cl_kernel kernel_decompress; cl_context context; @@ -1049,6 +1062,8 @@ typedef struct hc_device_param cl_mem d_pws_buf; cl_mem d_pws_amp_buf; + cl_mem d_pws_comp_buf; + cl_mem d_pws_idx; cl_mem d_words_buf_l; cl_mem d_words_buf_r; cl_mem d_rules; @@ -1092,6 +1107,7 @@ typedef struct hc_device_param void *kernel_params_tm[PARAMCNT]; void *kernel_params_memset[PARAMCNT]; void *kernel_params_atinit[PARAMCNT]; + void *kernel_params_decompress[PARAMCNT]; u32 kernel_params_buf32[PARAMCNT]; u64 kernel_params_buf64[PARAMCNT]; @@ -1114,6 +1130,9 @@ typedef struct hc_device_param u32 kernel_params_atinit_buf32[PARAMCNT]; u64 kernel_params_atinit_buf64[PARAMCNT]; + u32 kernel_params_decompress_buf32[PARAMCNT]; + u64 kernel_params_decompress_buf64[PARAMCNT]; + } hc_device_param_t; typedef struct opencl_ctx diff --git a/src/autotune.c b/src/autotune.c index 074e71f4e..54b09c2d5 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -116,16 +116,6 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (CL_rc == -1) return -1; } } - else - { - CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, device_param->d_pws_amp_buf, 0, 0, kernel_power_max * sizeof (pw_t), 0, NULL, NULL); - - if (CL_rc == -1) return -1; - - CL_rc = run_kernel_amp (hashcat_ctx, device_param, kernel_power_max); - - if (CL_rc == -1) return -1; - } // Do a pre-autotune test run to find out if kernel runtime is above some TDR limit @@ -236,13 +226,6 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (CL_rc == -1) return -1; - if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) - { - CL_rc = run_kernel_memset (hashcat_ctx, device_param, device_param->d_pws_amp_buf, 0, device_param->size_pws_amp); - - if (CL_rc == -1) return -1; - } - // reset other buffers in case autotune cracked something CL_rc = run_kernel_memset (hashcat_ctx, device_param, device_param->d_plain_bufs, 0, device_param->size_plains); diff --git a/src/dispatch.c b/src/dispatch.c index b843783e1..c05f5d7af 100644 --- a/src/dispatch.c +++ b/src/dispatch.c @@ -158,7 +158,8 @@ static int calc_stdin (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par u32 words_extra_total = 0; - memset (device_param->pws_buf, 0, device_param->size_pws); + memset (device_param->pws_comp, 0, device_param->size_pws_comp); + memset (device_param->pws_idx, 0, device_param->size_pws_idx); while (device_param->pws_cnt < device_param->kernel_power) { @@ -490,7 +491,8 @@ static int calc (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) u32 words_extra_total = 0; - memset (device_param->pws_buf, 0, device_param->size_pws); + memset (device_param->pws_comp, 0, device_param->size_pws_comp); + memset (device_param->pws_idx, 0, device_param->size_pws_idx); while (words_extra) { diff --git a/src/opencl.c b/src/opencl.c index c5bbd1277..a004dfc66 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -1146,10 +1146,29 @@ int hc_clReleaseEvent (hashcat_ctx_t *hashcat_ctx, cl_event event) int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw) { - int CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, gidd * sizeof (pw_t), sizeof (pw_t), pw, 0, NULL, NULL); + pw_idx_t pw_idx; + + int CL_rc; + + CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, gidd * sizeof (pw_idx_t), sizeof (pw_idx_t), &pw_idx, 0, NULL, NULL); if (CL_rc == -1) return -1; + const u32 off = pw_idx.off; + const u32 cnt = pw_idx.cnt; + const u32 len = pw_idx.len; + + CL_rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, off * sizeof (u32), len * sizeof (u32), pw->i, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + for (u32 i = cnt; i < 64; i++) + { + pw->i[i] = 0; + } + + pw->pw_len = len; + return 0; } @@ -1225,7 +1244,7 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (run_init == true) { - CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, device_param->d_pws_amp_buf, 0, 0, pws_cnt * sizeof (pw_t), 0, NULL, NULL); + CL_rc = hc_clEnqueueCopyBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_amp_buf, device_param->d_pws_buf, 0, 0, pws_cnt * sizeof (pw_t), 0, NULL, NULL); if (CL_rc == -1) return -1; @@ -1841,11 +1860,99 @@ int run_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par return 0; } +int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num) +{ + u64 num_elements = num; + + device_param->kernel_params_decompress_buf64[3] = num_elements; + + const u64 kernel_threads = device_param->kernel_threads_by_wgs_kernel_decompress; + + while (num_elements % kernel_threads) num_elements++; + + cl_kernel kernel = device_param->kernel_decompress; + + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + int CL_rc; + + CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue); + + if (CL_rc == -1) return -1; + + return 0; +} + int run_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size) { return run_kernel_memset (hashcat_ctx, device_param, buf, 0, size); } +void rebuild_pws_compressed_append (hc_device_param_t *device_param, const u32 pws_cnt, const u8 chr) +{ + // this function is used if we have to modify the compressed pws buffer in order to + // append some data to each password candidate + + u32 *tmp_pws_comp = (u32 *) hcmalloc (device_param->size_pws_comp); + pw_idx_t *tmp_pws_idx = (pw_idx_t *) hcmalloc (device_param->size_pws_idx); + + for (u32 i = 0; i < pws_cnt; i++) + { + pw_idx_t *pw_idx_src = device_param->pws_idx + i; + pw_idx_t *pw_idx_dst = tmp_pws_idx + i; + + const u32 src_off = pw_idx_src->off; + const u32 src_cnt = pw_idx_src->cnt; + const u32 src_len = pw_idx_src->len; + + u8 buf[256]; + + memcpy (buf, device_param->pws_comp + src_off, src_len); + + buf[src_len] = chr; + + const u32 dst_len = src_len + 1; + + const u32 dst_pw_len4 = (dst_len + 3) & ~3; // round up to multiple of 4 + + const u32 dst_pw_len4_cnt = dst_pw_len4 / 4; + + pw_idx_dst->cnt = dst_pw_len4_cnt; + pw_idx_dst->len = src_len; // this is intenionally! src_len can not be dst_len, we dont want the kernel to think 0x80 is part of the password + + u8 *dst = (u8 *) (tmp_pws_comp + pw_idx_dst->off); + + memcpy (dst, buf, dst_len); + + memset (dst + dst_len, 0, dst_pw_len4 - dst_len); + + // prepare next element + + pw_idx_t *pw_idx_dst_next = pw_idx_dst + 1; + + pw_idx_dst_next->off = pw_idx_dst->off + pw_idx_dst->cnt; + } + + memcpy (device_param->pws_comp, tmp_pws_comp, device_param->size_pws_comp); + memcpy (device_param->pws_idx, tmp_pws_idx, device_param->size_pws_idx); + + hcfree (tmp_pws_comp); + hcfree (tmp_pws_idx); +} + int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 pws_cnt) { combinator_ctx_t *combinator_ctx = hashcat_ctx->combinator_ctx; @@ -1855,7 +1962,21 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (user_options_extra->attack_kern == ATTACK_KERN_STRAIGHT) { - const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, pws_cnt * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + int CL_rc; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; + + const u32 off = pw_idx->off; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt); if (CL_rc == -1) return -1; } @@ -1869,25 +1990,11 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) { - for (u32 i = 0; i < pws_cnt; i++) - { - const u32 pw_len = device_param->pws_buf[i].pw_len; - - u8 *ptr = (u8 *) device_param->pws_buf[i].i; - - ptr[pw_len] = 0x01; - } + rebuild_pws_compressed_append (device_param, pws_cnt, 0x01); } else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) { - for (u32 i = 0; i < pws_cnt; i++) - { - const u32 pw_len = device_param->pws_buf[i].pw_len; - - u8 *ptr = (u8 *) device_param->pws_buf[i].i; - - ptr[pw_len] = 0x80; - } + rebuild_pws_compressed_append (device_param, pws_cnt, 0x80); } } } @@ -1895,29 +2002,29 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { if (hashconfig->opts_type & OPTS_TYPE_PT_ADD01) { - for (u32 i = 0; i < pws_cnt; i++) - { - const u32 pw_len = device_param->pws_buf[i].pw_len; - - u8 *ptr = (u8 *) device_param->pws_buf[i].i; - - ptr[pw_len] = 0x01; - } + rebuild_pws_compressed_append (device_param, pws_cnt, 0x01); } else if (hashconfig->opts_type & OPTS_TYPE_PT_ADD80) { - for (u32 i = 0; i < pws_cnt; i++) - { - const u32 pw_len = device_param->pws_buf[i].pw_len; - - u8 *ptr = (u8 *) device_param->pws_buf[i].i; - - ptr[pw_len] = 0x80; - } + rebuild_pws_compressed_append (device_param, pws_cnt, 0x80); } } - const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, pws_cnt * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + int CL_rc; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; + + const u32 off = pw_idx->off; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt); if (CL_rc == -1) return -1; } @@ -1925,13 +2032,41 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { if (user_options->attack_mode == ATTACK_MODE_COMBI) { - const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, pws_cnt * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + int CL_rc; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; + + const u32 off = pw_idx->off; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt); if (CL_rc == -1) return -1; } else if (user_options->attack_mode == ATTACK_MODE_HYBRID1) { - const int CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, pws_cnt * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + int CL_rc; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; + + const u32 off = pw_idx->off; + + CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + CL_rc = run_kernel_decompress (hashcat_ctx, device_param, pws_cnt); if (CL_rc == -1) return -1; } @@ -4096,6 +4231,14 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) const size_t size_pws_amp = size_pws; + // size_pws_comp + + const size_t size_pws_comp = (size_t) kernel_power_max * (sizeof (u32) * 64); + + // size_pws_idx + + const size_t size_pws_idx = (size_t) (kernel_power_max + 1) * sizeof (pw_idx_t); + // size_tmps const size_t size_tmps = (size_t) kernel_power_max * hashconfig->tmp_size; @@ -4122,6 +4265,8 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) + size_plains + size_pws + size_pws_amp + + size_pws_comp + + size_pws_idx + size_results + size_root_css + size_rules @@ -4205,10 +4350,12 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) // find out if we would request too much memory on memory blocks which are based on kernel_accel - size_t size_pws = 4; - size_t size_pws_amp = 4; - size_t size_tmps = 4; - size_t size_hooks = 4; + size_t size_pws = 4; + size_t size_pws_amp = 4; + size_t size_pws_comp = 4; + size_t size_pws_idx = 4; + size_t size_tmps = 4; + size_t size_hooks = 4; while (kernel_accel_max >= kernel_accel_min) { @@ -4220,6 +4367,14 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) size_pws_amp = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) ? 1 : size_pws; + // size_pws_comp + + size_pws_comp = (size_t) kernel_power_max * (sizeof (u32) * 64); + + // size_pws_idx + + size_pws_idx = (size_t) (kernel_power_max + 1) * sizeof (pw_idx_t); + // size_tmps size_tmps = (size_t) kernel_power_max * hashconfig->tmp_size; @@ -4255,6 +4410,8 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) + size_plains + size_pws + size_pws_amp + + size_pws_comp + + size_pws_idx + size_results + size_root_css + size_rules @@ -4302,14 +4459,16 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) } */ - device_param->size_bfs = size_bfs; - device_param->size_combs = size_combs; - device_param->size_rules = size_rules; - device_param->size_rules_c = size_rules_c; - device_param->size_pws = size_pws; - device_param->size_pws_amp = size_pws_amp; - device_param->size_tmps = size_tmps; - device_param->size_hooks = size_hooks; + device_param->size_bfs = size_bfs; + device_param->size_combs = size_combs; + device_param->size_rules = size_rules; + device_param->size_rules_c = size_rules_c; + device_param->size_pws = size_pws; + device_param->size_pws_amp = size_pws_amp; + device_param->size_pws_comp = size_pws_comp; + device_param->size_pws_idx = size_pws_idx; + device_param->size_tmps = size_tmps; + device_param->size_hooks = size_hooks; /** * default building options @@ -4940,8 +5099,10 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) * global buffers */ - CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_pws, NULL, &device_param->d_pws_buf); if (CL_rc == -1) return -1; - CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_pws_amp, NULL, &device_param->d_pws_amp_buf); if (CL_rc == -1) return -1; + CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE, size_pws, NULL, &device_param->d_pws_buf); if (CL_rc == -1) return -1; + CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE, size_pws_amp, NULL, &device_param->d_pws_amp_buf); if (CL_rc == -1) return -1; + CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_pws_comp, NULL, &device_param->d_pws_comp_buf); if (CL_rc == -1) return -1; + CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, size_pws_idx, NULL, &device_param->d_pws_idx); if (CL_rc == -1) return -1; CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE, size_tmps, NULL, &device_param->d_tmps); if (CL_rc == -1) return -1; CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_WRITE, size_hooks, NULL, &device_param->d_hooks); if (CL_rc == -1) return -1; CL_rc = hc_clCreateBuffer (hashcat_ctx, device_param->context, CL_MEM_READ_ONLY, bitmap_ctx->bitmap_size, NULL, &device_param->d_bitmap_s1_a); if (CL_rc == -1) return -1; @@ -5034,9 +5195,13 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) * main host data */ - pw_t *pws_buf = (pw_t *) hcmalloc (size_pws); + u32 *pws_comp = (u32 *) hcmalloc (size_pws_comp); - device_param->pws_buf = pws_buf; + device_param->pws_comp = pws_comp; + + pw_idx_t *pws_idx = (pw_idx_t *) hcmalloc (size_pws_idx); + + device_param->pws_idx = pws_idx; pw_t *combs_buf = (pw_t *) hccalloc (KERNEL_COMBS, sizeof (pw_t)); @@ -5062,9 +5227,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_buf32[33] = 0; // combs_mode device_param->kernel_params_buf64[34] = 0; // gid_max - device_param->kernel_params[ 0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - ? &device_param->d_pws_buf - : &device_param->d_pws_amp_buf; + device_param->kernel_params[ 0] = &device_param->d_pws_buf; device_param->kernel_params[ 1] = &device_param->d_rules_c; device_param->kernel_params[ 2] = &device_param->d_combs_c; device_param->kernel_params[ 3] = &device_param->d_bfs_c; @@ -5119,7 +5282,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) } else { - device_param->kernel_params_mp[0] = &device_param->d_pws_buf; + device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + ? &device_param->d_pws_buf + : &device_param->d_pws_amp_buf; } } @@ -5140,7 +5305,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_mp_l_buf32[8] = 0; device_param->kernel_params_mp_l_buf64[9] = 0; - device_param->kernel_params_mp_l[0] = &device_param->d_pws_buf; + device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + ? &device_param->d_pws_buf + : &device_param->d_pws_amp_buf; device_param->kernel_params_mp_l[1] = &device_param->d_root_css_buf; device_param->kernel_params_mp_l[2] = &device_param->d_markov_css_buf; device_param->kernel_params_mp_l[3] = &device_param->kernel_params_mp_l_buf64[3]; @@ -5194,6 +5361,15 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_atinit[0] = NULL; device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1]; + device_param->kernel_params_decompress_buf64[3] = 0; // gid_max + + device_param->kernel_params_decompress[0] = &device_param->d_pws_idx; + device_param->kernel_params_decompress[1] = &device_param->d_pws_comp_buf; + device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + ? &device_param->d_pws_buf + : &device_param->d_pws_amp_buf; + device_param->kernel_params_decompress[3] = &device_param->kernel_params_decompress_buf64[3]; + /** * kernel name */ @@ -5451,6 +5627,21 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1; CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1; + // GPU decompress + + CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, "gpu_decompress", &device_param->kernel_decompress); + + if (CL_rc == -1) return -1; + + CL_rc = get_kernel_threads (hashcat_ctx, device_param, device_param->kernel_decompress, &device_param->kernel_threads_by_wgs_kernel_decompress); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1; + CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1; + CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1; + CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_decompress, 3, sizeof (cl_ulong), device_param->kernel_params_decompress[3]); if (CL_rc == -1) return -1; + // MP start if (user_options->attack_mode == ATTACK_MODE_BF) @@ -5537,7 +5728,7 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) if (CL_rc == -1) return -1; } - for (u32 i = 7; i < 7; i++) + for (u32 i = 6; i < 7; i++) { CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_amp, i, sizeof (cl_ulong), device_param->kernel_params_amp[i]); @@ -5549,6 +5740,8 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_buf, device_param->size_pws); if (CL_rc == -1) return -1; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_amp_buf, device_param->size_pws_amp); if (CL_rc == -1) return -1; + CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_comp_buf, device_param->size_pws_comp); if (CL_rc == -1) return -1; + CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_idx, device_param->size_pws_idx); if (CL_rc == -1) return -1; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_tmps, device_param->size_tmps); if (CL_rc == -1) return -1; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_hooks, device_param->size_hooks); if (CL_rc == -1) return -1; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_plain_bufs, device_param->size_plains); if (CL_rc == -1) return -1; @@ -5666,12 +5859,15 @@ void opencl_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->skipped == true) continue; - hcfree (device_param->pws_buf); + hcfree (device_param->pws_comp); + hcfree (device_param->pws_idx); hcfree (device_param->combs_buf); hcfree (device_param->hooks_buf); if (device_param->d_pws_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_buf); if (device_param->d_pws_amp_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_amp_buf); + if (device_param->d_pws_comp_buf) hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_comp_buf); + if (device_param->d_pws_idx) hc_clReleaseMemObject (hashcat_ctx, device_param->d_pws_idx); if (device_param->d_rules) hc_clReleaseMemObject (hashcat_ctx, device_param->d_rules); if (device_param->d_rules_c) hc_clReleaseMemObject (hashcat_ctx, device_param->d_rules_c); if (device_param->d_combs) hc_clReleaseMemObject (hashcat_ctx, device_param->d_combs); @@ -5720,6 +5916,7 @@ void opencl_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->kernel_amp) hc_clReleaseKernel (hashcat_ctx, device_param->kernel_amp); if (device_param->kernel_memset) hc_clReleaseKernel (hashcat_ctx, device_param->kernel_memset); if (device_param->kernel_atinit) hc_clReleaseKernel (hashcat_ctx, device_param->kernel_atinit); + if (device_param->kernel_decompress)hc_clReleaseKernel (hashcat_ctx, device_param->kernel_decompress); if (device_param->program) hc_clReleaseProgram (hashcat_ctx, device_param->program); if (device_param->program_mp) hc_clReleaseProgram (hashcat_ctx, device_param->program_mp); @@ -5729,12 +5926,15 @@ void opencl_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->context) hc_clReleaseContext (hashcat_ctx, device_param->context); - device_param->pws_buf = NULL; + device_param->pws_comp = NULL; + device_param->pws_idx = NULL; device_param->combs_buf = NULL; device_param->hooks_buf = NULL; device_param->d_pws_buf = NULL; device_param->d_pws_amp_buf = NULL; + device_param->d_pws_comp_buf = NULL; + device_param->d_pws_idx = NULL; device_param->d_rules = NULL; device_param->d_rules_c = NULL; device_param->d_combs = NULL; @@ -5782,6 +5982,7 @@ void opencl_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->kernel_amp = NULL; device_param->kernel_memset = NULL; device_param->kernel_atinit = NULL; + device_param->kernel_decompress = NULL; device_param->program = NULL; device_param->program_mp = NULL; device_param->program_amp = NULL; @@ -5818,7 +6019,8 @@ void opencl_session_reset (hashcat_ctx_t *hashcat_ctx) // some more resets: - if (device_param->pws_buf) memset (device_param->pws_buf, 0, device_param->size_pws); + if (device_param->pws_comp) memset (device_param->pws_comp, 0, device_param->size_pws_comp); + if (device_param->pws_idx) memset (device_param->pws_idx, 0, device_param->size_pws_idx); device_param->pws_cnt = 0; diff --git a/src/selftest.c b/src/selftest.c index 4e741ea76..bea6580f2 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -296,7 +296,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param pw.pw_len = pw_len; - CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL); + CL_err = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL); if (CL_err != CL_SUCCESS) return -1; } @@ -481,7 +481,6 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param device_param->kernel_params[18] = &device_param->d_esalt_bufs; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_buf, device_param->size_pws); if (CL_rc == -1) return -1; - CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_pws_amp_buf, device_param->size_pws_amp); if (CL_rc == -1) return -1; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_tmps, device_param->size_tmps); if (CL_rc == -1) return -1; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_hooks, device_param->size_hooks); if (CL_rc == -1) return -1; CL_rc = run_kernel_bzero (hashcat_ctx, device_param, device_param->d_plain_bufs, device_param->size_plains); if (CL_rc == -1) return -1; diff --git a/src/wordlist.c b/src/wordlist.c index 4bea6a45d..8ce9b8ed5 100644 --- a/src/wordlist.c +++ b/src/wordlist.c @@ -242,16 +242,26 @@ void pw_add (hc_device_param_t *device_param, const u8 *pw_buf, const int pw_len { if (device_param->pws_cnt < device_param->kernel_power) { - pw_t *pw = device_param->pws_buf + device_param->pws_cnt; + pw_idx_t *pw_idx = device_param->pws_idx + device_param->pws_cnt; - u8 *ptr = (u8 *) pw->i; + const u32 pw_len4 = (pw_len + 3) & ~3; // round up to multiple of 4 - memcpy (ptr, pw_buf, pw_len); + const u32 pw_len4_cnt = pw_len4 / 4; - //memset zero to entire buffer done in outer loop - //memset (ptr + pw_len, 0, sizeof (pw->i) - pw_len); + pw_idx->cnt = pw_len4_cnt; + pw_idx->len = pw_len; - pw->pw_len = pw_len; + u8 *dst = (u8 *) (device_param->pws_comp + pw_idx->off); + + memcpy (dst, pw_buf, pw_len); + + memset (dst + pw_len, 0, pw_len4 - pw_len); + + // prepare next element + + pw_idx_t *pw_idx_next = pw_idx + 1; + + pw_idx_next->off = pw_idx->off + pw_idx->cnt; device_param->pws_cnt++; }