From 7062425d2b5c006948a511c5b1e3472b0131da0d Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 23 Jan 2018 20:33:26 +0100 Subject: [PATCH] OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements to reduce startup time --- OpenCL/inc_common.cl | 81 ++++++++++++++++++++++++++++++++++++++++++++ docs/changes.txt | 1 + include/opencl.h | 1 + include/types.h | 6 ++++ src/autotune.c | 9 +---- src/opencl.c | 58 +++++++++++++++++++++++++++++++ 6 files changed, 148 insertions(+), 8 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 3b11d722a..f0e924d78 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -61594,3 +61594,84 @@ __kernel void gpu_memset (__global uint4 *buf, const u32 value, const u64 gid_ma buf[gid] = (uint4) (value); } + +__kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max) +{ + const u64 gid = get_global_id (0); + + if (gid >= gid_max) return; + + const u32 l32 = l32_from_64_S (gid); + const u32 h32 = h32_from_64_S (gid); + + pw_t pw; + + pw.i[ 0] = 0x5c5c5c5c ^ l32; + pw.i[ 1] = 0x36363636 ^ h32; + pw.i[ 2] = 0; + pw.i[ 3] = 0; + pw.i[ 4] = 0; + pw.i[ 5] = 0; + pw.i[ 6] = 0; + pw.i[ 7] = 0; + pw.i[ 8] = 0; + pw.i[ 9] = 0; + pw.i[10] = 0; + pw.i[11] = 0; + pw.i[12] = 0; + pw.i[13] = 0; + pw.i[14] = 0; + pw.i[15] = 0; + pw.i[16] = 0; + pw.i[17] = 0; + pw.i[18] = 0; + pw.i[19] = 0; + pw.i[20] = 0; + pw.i[21] = 0; + pw.i[22] = 0; + pw.i[23] = 0; + pw.i[24] = 0; + pw.i[25] = 0; + pw.i[26] = 0; + pw.i[27] = 0; + pw.i[28] = 0; + pw.i[29] = 0; + pw.i[30] = 0; + pw.i[31] = 0; + pw.i[32] = 0; + pw.i[33] = 0; + pw.i[34] = 0; + pw.i[35] = 0; + pw.i[36] = 0; + pw.i[37] = 0; + pw.i[38] = 0; + pw.i[39] = 0; + pw.i[40] = 0; + pw.i[41] = 0; + pw.i[42] = 0; + pw.i[43] = 0; + pw.i[44] = 0; + pw.i[45] = 0; + pw.i[46] = 0; + pw.i[47] = 0; + pw.i[48] = 0; + pw.i[49] = 0; + pw.i[50] = 0; + pw.i[51] = 0; + pw.i[52] = 0; + pw.i[53] = 0; + pw.i[54] = 0; + pw.i[55] = 0; + pw.i[56] = 0; + pw.i[57] = 0; + pw.i[58] = 0; + pw.i[59] = 0; + pw.i[60] = 0; + pw.i[61] = 0; + pw.i[62] = 0; + pw.i[63] = 0; // yep that's faster + + pw.pw_len = 1 + (l32 & 15); + + buf[gid] = pw; +} diff --git a/docs/changes.txt b/docs/changes.txt index 5b3f19fc7..90cb0fa4f 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -45,6 +45,7 @@ - Files: Copy include/ folder and its content to case SHARED is set to 1 in Makefile - Hash Parser: Changed the way large strings are handled/truncated within the event buffer if they are too large to fit - Hash Parser: Fixed our use of strtok_r () calls +- OpenCL Kernels: Use a special kernel to initialize the password buffer used during autotune measurements to reduce startup time - OpenCL Kernels: Replace variables from uXX to uXXa if used in __constant space - OpenCL Kernels: Use static declaraction for uXXa variables used in __constant space - Self Test: Skip self-test for mode 8900 user-configurable scrypt settings are incompatible to fixed settings in the self-test hash diff --git a/include/opencl.h b/include/opencl.h index 7955dab52..c30c35d1b 100644 --- a/include/opencl.h +++ b/include/opencl.h @@ -63,6 +63,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 num); int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param); int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 num); +int run_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num); int run_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size); int run_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size); int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 pws_cnt); diff --git a/include/types.h b/include/types.h index 4dfdf9a75..55060c3c1 100644 --- a/include/types.h +++ b/include/types.h @@ -934,6 +934,7 @@ typedef struct hc_device_param u32 kernel_threads_by_wgs_kernel_amp; u32 kernel_threads_by_wgs_kernel_tm; u32 kernel_threads_by_wgs_kernel_memset; + u32 kernel_threads_by_wgs_kernel_atinit; u32 kernel_loops; u32 kernel_accel; @@ -1034,6 +1035,7 @@ typedef struct hc_device_param cl_kernel kernel_amp; cl_kernel kernel_tm; cl_kernel kernel_memset; + cl_kernel kernel_atinit; cl_context context; @@ -1087,6 +1089,7 @@ typedef struct hc_device_param void *kernel_params_amp[PARAMCNT]; void *kernel_params_tm[PARAMCNT]; void *kernel_params_memset[PARAMCNT]; + void *kernel_params_atinit[PARAMCNT]; u32 kernel_params_buf32[PARAMCNT]; u64 kernel_params_buf64[PARAMCNT]; @@ -1106,6 +1109,9 @@ typedef struct hc_device_param u32 kernel_params_memset_buf32[PARAMCNT]; u64 kernel_params_memset_buf64[PARAMCNT]; + u32 kernel_params_atinit_buf32[PARAMCNT]; + u64 kernel_params_atinit_buf64[PARAMCNT]; + } hc_device_param_t; typedef struct opencl_ctx diff --git a/src/autotune.c b/src/autotune.c index 459877e82..7ef4111fe 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -103,14 +103,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param int CL_rc; - for (u32 i = 0; i < kernel_power_max; i++) - { - device_param->pws_buf[i].i[0] = i; - device_param->pws_buf[i].i[1] = 0x01234567; - device_param->pws_buf[i].pw_len = 7 + (i & 7); - } - - CL_rc = hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power_max * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL); + CL_rc = run_kernel_atinit (hashcat_ctx, device_param, device_param->d_pws_buf, kernel_power_max); if (CL_rc == -1) return -1; diff --git a/src/opencl.c b/src/opencl.c index 369d831eb..b82fd048a 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -1727,6 +1727,44 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, return 0; } +int run_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num) +{ + u64 num_elements = num; + + device_param->kernel_params_atinit_buf64[1] = num_elements; + + const u64 kernel_threads = device_param->kernel_threads_by_wgs_kernel_atinit; + + cl_kernel kernel = device_param->kernel_atinit; + + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + int CL_rc; + + CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clFlush (hashcat_ctx, device_param->command_queue); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clFinish (hashcat_ctx, device_param->command_queue); + + if (CL_rc == -1) return -1; + + return 0; +} + int run_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u32 value, const u64 size) { const u64 num16d = size / 16; @@ -5075,6 +5113,11 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1]; device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf64[2]; + device_param->kernel_params_atinit_buf64[1] = 0; // gid_max + + device_param->kernel_params_atinit[0] = NULL; + device_param->kernel_params_atinit[1] = &device_param->kernel_params_atinit_buf64[1]; + /** * kernel name */ @@ -5319,6 +5362,19 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; + // GPU autotune init + + CL_rc = hc_clCreateKernel (hashcat_ctx, device_param->program, "gpu_atinit", &device_param->kernel_atinit); + + if (CL_rc == -1) return -1; + + CL_rc = get_kernel_threads (hashcat_ctx, device_param, device_param->kernel_atinit, &device_param->kernel_threads_by_wgs_kernel_atinit); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]); if (CL_rc == -1) return -1; + CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->kernel_atinit, 1, sizeof (cl_ulong), device_param->kernel_params_atinit[1]); if (CL_rc == -1) return -1; + // MP start if (user_options->attack_mode == ATTACK_MODE_BF) @@ -5587,6 +5643,7 @@ void opencl_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->kernel_tm) hc_clReleaseKernel (hashcat_ctx, device_param->kernel_tm); if (device_param->kernel_amp) hc_clReleaseKernel (hashcat_ctx, device_param->kernel_amp); if (device_param->kernel_memset) hc_clReleaseKernel (hashcat_ctx, device_param->kernel_memset); + if (device_param->kernel_atinit) hc_clReleaseKernel (hashcat_ctx, device_param->kernel_atinit); if (device_param->program) hc_clReleaseProgram (hashcat_ctx, device_param->program); if (device_param->program_mp) hc_clReleaseProgram (hashcat_ctx, device_param->program_mp); @@ -5648,6 +5705,7 @@ void opencl_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->kernel_tm = NULL; device_param->kernel_amp = NULL; device_param->kernel_memset = NULL; + device_param->kernel_atinit = NULL; device_param->program = NULL; device_param->program_mp = NULL; device_param->program_amp = NULL;