From 2a1d0d21ff25563de9705f2e64b4040cbed36c90 Mon Sep 17 00:00:00 2001 From: Gabriele 'matrix' Gristina Date: Fri, 12 Feb 2016 17:11:23 +0100 Subject: [PATCH] Get rid of hc_clEnqueueNDRangeKernel double call --- include/ext_OpenCL.h | 2 +- src/ext_OpenCL.c | 13 ++------ src/oclHashcat.c | 73 ++++++++++++++++++-------------------------- 3 files changed, 33 insertions(+), 55 deletions(-) diff --git a/include/ext_OpenCL.h b/include/ext_OpenCL.h index 926fc335c..d550c9f8e 100644 --- a/include/ext_OpenCL.h +++ b/include/ext_OpenCL.h @@ -116,7 +116,7 @@ cl_kernel hc_clCreateKernel (OCL_PTR *ocl, cl_program program, const char *kerne cl_program hc_clCreateProgramWithSource (OCL_PTR *ocl, cl_context context, cl_uint count, const char **strings, const size_t *lengths); cl_program hc_clCreateProgramWithBinary (OCL_PTR *ocl, cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status); cl_int hc_clBuildProgram (OCL_PTR *ocl, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data, bool exitOnFail); -cl_int hc_clEnqueueNDRangeKernel (OCL_PTR *ocl, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, bool exitOnFail); +void hc_clEnqueueNDRangeKernel (OCL_PTR *ocl, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); void hc_clEnqueueReadBuffer (OCL_PTR *ocl, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); void hc_clEnqueueWriteBuffer (OCL_PTR *ocl, cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); void hc_clEnqueueCopyBuffer (OCL_PTR *ocl, cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t cb, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); diff --git a/src/ext_OpenCL.c b/src/ext_OpenCL.c index 60b737816..65c594c03 100644 --- a/src/ext_OpenCL.c +++ b/src/ext_OpenCL.c @@ -130,23 +130,16 @@ void ocl_close (OCL_PTR *ocl) } } -cl_int hc_clEnqueueNDRangeKernel (OCL_PTR *ocl, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, bool exitOnFail) +void hc_clEnqueueNDRangeKernel (OCL_PTR *ocl, cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { cl_int CL_err = ocl->clEnqueueNDRangeKernel (command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); if (CL_err != CL_SUCCESS) { - if (exitOnFail) - { - log_error ("ERROR: %s : %d : %s\n", "clEnqueueNDRangeKernel()", CL_err, val2cstr_cl (CL_err)); + log_error ("ERROR: %s : %d : %s\n", "clEnqueueNDRangeKernel()", CL_err, val2cstr_cl (CL_err)); - exit (-1); - } - - return (-1); + exit (-1); } - - return 0; } void hc_clGetEventInfo (OCL_PTR *ocl, cl_event event, cl_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) diff --git a/src/oclHashcat.c b/src/oclHashcat.c index 4d54dbad3..63a51cda3 100644 --- a/src/oclHashcat.c +++ b/src/oclHashcat.c @@ -2475,21 +2475,18 @@ static void run_kernel (const uint kern_run, hc_device_param_t *device_param, co const size_t global_work_size[3] = { num_elements, 32, 1 }; const size_t local_work_size[3] = { kernel_threads / 32, 32, 1 }; - hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &event, true); + hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &event); } else { + size_t workgroup_size = 0; + hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL); + if (kernel_threads > workgroup_size) kernel_threads = workgroup_size; + const size_t global_work_size[3] = { num_elements, 1, 1 }; const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - const cl_int rc = hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &event, false); - - if (rc != CL_SUCCESS) - { - const size_t local_work_size_fallback[3] = { 1, 1, 1 }; - - hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size_fallback, 0, NULL, &event, true); - } + hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &event); } hc_clFlush (data.ocl, device_param->command_queue); @@ -2539,7 +2536,7 @@ static void run_kernel_mp (const uint kern_run, hc_device_param_t *device_param, // causes problems with special threads like in bcrypt // const uint kernel_threads = device_param->kernel_threads; - const uint kernel_threads = KERNEL_THREADS; + uint kernel_threads = KERNEL_THREADS; while (num_elements % kernel_threads) num_elements++; @@ -2578,17 +2575,14 @@ static void run_kernel_mp (const uint kern_run, hc_device_param_t *device_param, break; } + size_t workgroup_size = 0; + hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); + if (kernel_threads > workgroup_size) kernel_threads = workgroup_size; + const size_t global_work_size[3] = { num_elements, 1, 1 }; const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - const cl_int rc = hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL, false); - - if (rc != CL_SUCCESS) - { - const size_t local_work_size_fallback[3] = { 1, 1, 1 }; - - hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size_fallback, 0, NULL, NULL, true); - } + hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); hc_clFlush (data.ocl, device_param->command_queue); @@ -2605,17 +2599,14 @@ static void run_kernel_tb (hc_device_param_t *device_param, const uint num) cl_kernel kernel = device_param->kernel_tb; + size_t workgroup_size = 0; + hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL); + if (kernel_threads > workgroup_size) kernel_threads = workgroup_size; + const size_t global_work_size[3] = { num_elements, 1, 1 }; const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - const cl_int rc = hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL, false); - - if (rc != CL_SUCCESS) - { - const size_t local_work_size_fallback[3] = { 1, 1, 1 }; - - hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size_fallback, 0, NULL, NULL, true); - } + hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); hc_clFlush (data.ocl, device_param->command_queue); @@ -2626,21 +2617,18 @@ static void run_kernel_tm (hc_device_param_t *device_param) { const uint num_elements = 1024; // fixed - const uint kernel_threads = 32; + uint kernel_threads = 32; cl_kernel kernel = device_param->kernel_tm; + size_t workgroup_size = 0; + hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL); + if (kernel_threads > workgroup_size) kernel_threads = workgroup_size; + const size_t global_work_size[3] = { num_elements, 1, 1 }; const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - const cl_int rc = hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL, false); - - if (rc != CL_SUCCESS) - { - const size_t local_work_size_fallback[3] = { 1, 1, 1 }; - - hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size_fallback, 0, NULL, NULL, true); - } + hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); hc_clFlush (data.ocl, device_param->command_queue); @@ -2657,7 +2645,7 @@ static void run_kernel_amp (hc_device_param_t *device_param, const uint num) // causes problems with special threads like in bcrypt // const uint kernel_threads = device_param->kernel_threads; - const uint kernel_threads = KERNEL_THREADS; + uint kernel_threads = KERNEL_THREADS; while (num_elements % kernel_threads) num_elements++; @@ -2666,17 +2654,14 @@ static void run_kernel_amp (hc_device_param_t *device_param, const uint num) hc_clSetKernelArg (data.ocl, kernel, 5, sizeof (cl_uint), device_param->kernel_params_amp[5]); hc_clSetKernelArg (data.ocl, kernel, 6, sizeof (cl_uint), device_param->kernel_params_amp[6]); + size_t workgroup_size = 0; + hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL); + if (kernel_threads > workgroup_size) kernel_threads = workgroup_size; + const size_t global_work_size[3] = { num_elements, 1, 1 }; const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - const cl_int rc = hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL, false); - - if (rc != CL_SUCCESS) - { - const size_t local_work_size_fallback[3] = { 1, 1, 1 }; - - hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size_fallback, 0, NULL, NULL, true); - } + hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); hc_clFlush (data.ocl, device_param->command_queue);