From 4df00033d71b52295996c22a82f28a154ec59c22 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 4 May 2019 10:44:03 +0200 Subject: [PATCH] Prepare CUDA events --- include/backend.h | 14 +- include/types.h | 1 - src/backend.c | 734 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 559 insertions(+), 190 deletions(-) diff --git a/include/backend.h b/include/backend.h index fede1f92b..057edb87f 100644 --- a/include/backend.h +++ b/include/backend.h @@ -42,27 +42,33 @@ int hc_nvrtcGetPTX (hashcat_ctx_t *hashcat_ctx, nvrtcProgram prog, int hc_cuCtxCreate (hashcat_ctx_t *hashcat_ctx, CUcontext *pctx, unsigned int flags, CUdevice dev); int hc_cuCtxDestroy (hashcat_ctx_t *hashcat_ctx, CUcontext ctx); int hc_cuCtxSetCurrent (hashcat_ctx_t *hashcat_ctx, CUcontext ctx); +int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx); int hc_cuDeviceGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUdevice_attribute attrib, CUdevice dev); int hc_cuDeviceGetCount (hashcat_ctx_t *hashcat_ctx, int *count); int hc_cuDeviceGet (hashcat_ctx_t *hashcat_ctx, CUdevice *device, int ordinal); int hc_cuDeviceGetName (hashcat_ctx_t *hashcat_ctx, char *name, int len, CUdevice dev); int hc_cuDeviceTotalMem (hashcat_ctx_t *hashcat_ctx, size_t *bytes, CUdevice dev); int hc_cuDriverGetVersion (hashcat_ctx_t *hashcat_ctx, int *driverVersion); +int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags); +int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent); +int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd); +int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent); +int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream); +int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent); +int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc); int hc_cuInit (hashcat_ctx_t *hashcat_ctx, unsigned int Flags); +int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); int hc_cuMemAlloc (hashcat_ctx_t *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize); int hc_cuMemcpyDtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); int hc_cuMemcpyDtoH (hashcat_ctx_t *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); int hc_cuMemcpyHtoD (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); int hc_cuMemFree (hashcat_ctx_t *hashcat_ctx, CUdeviceptr dptr); +int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name); int hc_cuModuleLoadDataEx (hashcat_ctx_t *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); int hc_cuModuleUnload (hashcat_ctx_t *hashcat_ctx, CUmodule hmod); -int hc_cuModuleGetFunction (hashcat_ctx_t *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name); -int hc_cuFuncGetAttribute (hashcat_ctx_t *hashcat_ctx, int *pi, CUfunction_attribute attrib, CUfunction hfunc); int hc_cuStreamCreate (hashcat_ctx_t *hashcat_ctx, CUstream *phStream, unsigned int Flags); int hc_cuStreamDestroy (hashcat_ctx_t *hashcat_ctx, CUstream hStream); int hc_cuStreamSynchronize (hashcat_ctx_t *hashcat_ctx, CUstream hStream); -int hc_cuLaunchKernel (hashcat_ctx_t *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); -int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx); int hc_clBuildProgram (hashcat_ctx_t *hashcat_ctx, cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data), void *user_data); int hc_clCreateBuffer (hashcat_ctx_t *hashcat_ctx, cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_mem *mem); diff --git a/include/types.h b/include/types.h index 861b27858..5ff50d5d4 100644 --- a/include/types.h +++ b/include/types.h @@ -1206,7 +1206,6 @@ typedef struct hc_device_param double spin_damp; - void *kernel_params[PARAMCNT]; void *kernel_params_mp[PARAMCNT]; void *kernel_params_mp_r[PARAMCNT]; diff --git a/src/backend.c b/src/backend.c index 47c90887c..434abb364 100644 --- a/src/backend.c +++ b/src/backend.c @@ -1599,6 +1599,168 @@ int hc_cuCtxSynchronize (hashcat_ctx_t *hashcat_ctx) return 0; } +int hc_cuEventCreate (hashcat_ctx_t *hashcat_ctx, CUevent *phEvent, unsigned int Flags) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventCreate (phEvent, Flags); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventCreate(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventCreate(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventDestroy (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventDestroy (hEvent); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventDestroy(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventDestroy(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventElapsedTime (hashcat_ctx_t *hashcat_ctx, float *pMilliseconds, CUevent hStart, CUevent hEnd) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventElapsedTime (pMilliseconds, hStart, hEnd); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventElapsedTime(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventElapsedTime(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventQuery (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventQuery (hEvent); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventQuery(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventQuery(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventRecord (hashcat_ctx_t *hashcat_ctx, CUevent hEvent, CUstream hStream) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventRecord (hEvent, hStream); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventRecord(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventRecord(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuEventSynchronize (hashcat_ctx_t *hashcat_ctx, CUevent hEvent) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + + CUDA_PTR *cuda = backend_ctx->cuda; + + const CUresult CU_err = cuda->cuEventSynchronize (hEvent); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuEventSynchronize(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuEventSynchronize(): %d", CU_err); + } + + return -1; + } + + return 0; +} + // OpenCL int ocl_init (hashcat_ctx_t *hashcat_ctx) @@ -2623,249 +2785,424 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con device_param->kernel_params_buf64[34] = num; - u64 kernel_threads = 0; - cl_kernel kernel = NULL; + u64 kernel_threads = 0; switch (kern_run) { - case KERN_RUN_1: - kernel = device_param->opencl_kernel1; - kernel_threads = device_param->kernel_wgs1; - break; - case KERN_RUN_12: - kernel = device_param->opencl_kernel12; - kernel_threads = device_param->kernel_wgs12; - break; - case KERN_RUN_2: - kernel = device_param->opencl_kernel2; - kernel_threads = device_param->kernel_wgs2; - break; - case KERN_RUN_23: - kernel = device_param->opencl_kernel23; - kernel_threads = device_param->kernel_wgs23; - break; - case KERN_RUN_3: - kernel = device_param->opencl_kernel3; - kernel_threads = device_param->kernel_wgs3; - break; - case KERN_RUN_4: - kernel = device_param->opencl_kernel4; - kernel_threads = device_param->kernel_wgs4; - break; - case KERN_RUN_INIT2: - kernel = device_param->opencl_kernel_init2; - kernel_threads = device_param->kernel_wgs_init2; - break; - case KERN_RUN_LOOP2: - kernel = device_param->opencl_kernel_loop2; - kernel_threads = device_param->kernel_wgs_loop2; - break; - case KERN_RUN_AUX1: - kernel = device_param->opencl_kernel_aux1; - kernel_threads = device_param->kernel_wgs_aux1; - break; - case KERN_RUN_AUX2: - kernel = device_param->opencl_kernel_aux2; - kernel_threads = device_param->kernel_wgs_aux2; - break; - case KERN_RUN_AUX3: - kernel = device_param->opencl_kernel_aux3; - kernel_threads = device_param->kernel_wgs_aux3; - break; - case KERN_RUN_AUX4: - kernel = device_param->opencl_kernel_aux4; - kernel_threads = device_param->kernel_wgs_aux4; - break; - default: - event_log_error (hashcat_ctx, "Invalid kernel specified."); - return -1; + case KERN_RUN_1: kernel_threads = device_param->kernel_wgs1; break; + case KERN_RUN_12: kernel_threads = device_param->kernel_wgs12; break; + case KERN_RUN_2: kernel_threads = device_param->kernel_wgs2; break; + case KERN_RUN_23: kernel_threads = device_param->kernel_wgs23; break; + case KERN_RUN_3: kernel_threads = device_param->kernel_wgs3; break; + case KERN_RUN_4: kernel_threads = device_param->kernel_wgs4; break; + case KERN_RUN_INIT2: kernel_threads = device_param->kernel_wgs_init2; break; + case KERN_RUN_LOOP2: kernel_threads = device_param->kernel_wgs_loop2; break; + case KERN_RUN_AUX1: kernel_threads = device_param->kernel_wgs_aux1; break; + case KERN_RUN_AUX2: kernel_threads = device_param->kernel_wgs_aux2; break; + case KERN_RUN_AUX3: kernel_threads = device_param->kernel_wgs_aux3; break; + case KERN_RUN_AUX4: kernel_threads = device_param->kernel_wgs_aux4; break; } kernel_threads = MIN (kernel_threads, device_param->kernel_threads); // kernel_threads = power_of_two_floor_32 (kernel_threads); - num_elements = round_up_multiple_64 (num_elements, kernel_threads); + if (device_param->is_cuda == true) + { + num_elements = CEILDIV (num_elements, kernel_threads); + } + + if (device_param->is_opencl == true) + { + num_elements = round_up_multiple_64 (num_elements, kernel_threads); + } int CL_rc; + int CU_rc; - for (u32 i = 0; i <= 23; i++) + if (device_param->is_cuda == true) { - CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_mem), device_param->kernel_params[i]); + CUfunction cuda_function = NULL; - if (CL_rc == -1) return -1; - } - - for (u32 i = 24; i <= 33; i++) - { - CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_uint), device_param->kernel_params[i]); - - if (CL_rc == -1) return -1; - } - - for (u32 i = 34; i <= 34; i++) - { - CL_rc = hc_clSetKernelArg (hashcat_ctx, kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]); - - if (CL_rc == -1) return -1; - } - - cl_event event; - - if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF)) - { - const size_t global_work_size[3] = { num_elements, 32, 1 }; - const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - - CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &event); - - if (CL_rc == -1) return -1; - } - else - { - if (kern_run == KERN_RUN_1) + if (device_param->is_cuda == true) { - if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT) + switch (kern_run) { - num_elements = CEILDIV (num_elements, device_param->vector_width); - } - } - else if (kern_run == KERN_RUN_2) - { - if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP) - { - num_elements = CEILDIV (num_elements, device_param->vector_width); - } - } - else if (kern_run == KERN_RUN_3) - { - if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP) - { - num_elements = CEILDIV (num_elements, device_param->vector_width); + case KERN_RUN_1: cuda_function = device_param->cuda_function1; break; + case KERN_RUN_12: cuda_function = device_param->cuda_function12; break; + case KERN_RUN_2: cuda_function = device_param->cuda_function2; break; + case KERN_RUN_23: cuda_function = device_param->cuda_function23; break; + case KERN_RUN_3: cuda_function = device_param->cuda_function3; break; + case KERN_RUN_4: cuda_function = device_param->cuda_function4; break; + case KERN_RUN_INIT2: cuda_function = device_param->cuda_function_init2; break; + case KERN_RUN_LOOP2: cuda_function = device_param->cuda_function_loop2; break; + case KERN_RUN_AUX1: cuda_function = device_param->cuda_function_aux1; break; + case KERN_RUN_AUX2: cuda_function = device_param->cuda_function_aux2; break; + case KERN_RUN_AUX3: cuda_function = device_param->cuda_function_aux3; break; + case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break; } } - num_elements = round_up_multiple_64 (num_elements, kernel_threads); + CUevent cuda_event; - const size_t global_work_size[3] = { num_elements, 1, 1 }; - const size_t local_work_size[3] = { kernel_threads, 1, 1 }; +/* + if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF)) + { + const size_t global_work_size[3] = { num_elements, 32, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &event); + CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event); + + if (CL_rc == -1) return -1; + } + else + { + if (kern_run == KERN_RUN_1) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + else if (kern_run == KERN_RUN_2) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + else if (kern_run == KERN_RUN_3) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + + num_elements = round_up_multiple_64 (num_elements, kernel_threads); + + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event); + + if (CL_rc == -1) return -1; + } + + CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue); if (CL_rc == -1) return -1; - } - CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue); + // spin damper section - if (CL_rc == -1) return -1; + const u32 iterationm = iteration % EXPECTED_ITERATIONS; - // spin damper section + cl_int opencl_event_status; - const u32 iterationm = iteration % EXPECTED_ITERATIONS; + size_t param_value_size_ret; - cl_int event_status; + CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, ¶m_value_size_ret); - size_t param_value_size_ret; + if (CL_rc == -1) return -1; - CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, ¶m_value_size_ret); + if (device_param->spin_damp > 0) + { + double spin_total = device_param->spin_damp; - if (CL_rc == -1) return -1; + while (opencl_event_status != CL_COMPLETE) + { + if (status_ctx->devices_status == STATUS_RUNNING) + { + switch (kern_run) + { + case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break; + } + } + else + { + // we were told to be nice - if (device_param->spin_damp > 0) - { - double spin_total = device_param->spin_damp; + sleep (0); + } - while (event_status != CL_COMPLETE) + CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, ¶m_value_size_ret); + + if (CL_rc == -1) return -1; + + spin_total += device_param->spin_damp; + + if (spin_total > 1) break; + } + } + + CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event); + + if (CL_rc == -1) return -1; + + cl_ulong time_start; + cl_ulong time_end; + + CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1; + CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END, sizeof (time_end), &time_end, NULL); if (CL_rc == -1) return -1; + + const double exec_us = (double) (time_end - time_start) / 1000; + + if (device_param->spin_damp > 0) { if (status_ctx->devices_status == STATUS_RUNNING) { switch (kern_run) { - case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break; - case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break; + case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break; + case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break; + case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break; + case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break; + case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break; + case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break; + case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break; + case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break; + case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break; } } - else - { - // we were told to be nice + } - sleep (0); + if (event_update) + { + u32 exec_pos = device_param->exec_pos; + + device_param->exec_msec[exec_pos] = exec_us / 1000; + + exec_pos++; + + if (exec_pos == EXEC_CACHE) + { + exec_pos = 0; } - CL_rc = hc_clGetEventInfo (hashcat_ctx, event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (event_status), &event_status, ¶m_value_size_ret); - - if (CL_rc == -1) return -1; - - spin_total += device_param->spin_damp; - - if (spin_total > 1) break; + device_param->exec_pos = exec_pos; } + + CL_rc = hc_clReleaseEvent (hashcat_ctx, opencl_event); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue); + + if (CL_rc == -1) return -1; +*/ } - CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &event); - - if (CL_rc == -1) return -1; - - cl_ulong time_start; - cl_ulong time_end; - - CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1; - CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, event, CL_PROFILING_COMMAND_END, sizeof (time_end), &time_end, NULL); if (CL_rc == -1) return -1; - - const double exec_us = (double) (time_end - time_start) / 1000; - - if (device_param->spin_damp > 0) + if (device_param->is_opencl == true) { - if (status_ctx->devices_status == STATUS_RUNNING) + cl_kernel opencl_kernel = NULL; + + if (device_param->is_opencl == true) { switch (kern_run) { - case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break; - case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break; - case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break; - case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break; - case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break; - case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break; - case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break; - case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break; - case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break; - case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break; + case KERN_RUN_1: opencl_kernel = device_param->opencl_kernel1; break; + case KERN_RUN_12: opencl_kernel = device_param->opencl_kernel12; break; + case KERN_RUN_2: opencl_kernel = device_param->opencl_kernel2; break; + case KERN_RUN_23: opencl_kernel = device_param->opencl_kernel23; break; + case KERN_RUN_3: opencl_kernel = device_param->opencl_kernel3; break; + case KERN_RUN_4: opencl_kernel = device_param->opencl_kernel4; break; + case KERN_RUN_INIT2: opencl_kernel = device_param->opencl_kernel_init2; break; + case KERN_RUN_LOOP2: opencl_kernel = device_param->opencl_kernel_loop2; break; + case KERN_RUN_AUX1: opencl_kernel = device_param->opencl_kernel_aux1; break; + case KERN_RUN_AUX2: opencl_kernel = device_param->opencl_kernel_aux2; break; + case KERN_RUN_AUX3: opencl_kernel = device_param->opencl_kernel_aux3; break; + case KERN_RUN_AUX4: opencl_kernel = device_param->opencl_kernel_aux4; break; } } - } - if (event_update) - { - u32 exec_pos = device_param->exec_pos; - - device_param->exec_msec[exec_pos] = exec_us / 1000; - - exec_pos++; - - if (exec_pos == EXEC_CACHE) + for (u32 i = 0; i <= 23; i++) { - exec_pos = 0; + CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]); + + if (CL_rc == -1) return -1; } - device_param->exec_pos = exec_pos; + for (u32 i = 24; i <= 33; i++) + { + CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_uint), device_param->kernel_params[i]); + + if (CL_rc == -1) return -1; + } + + for (u32 i = 34; i <= 34; i++) + { + CL_rc = hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_ulong), device_param->kernel_params[i]); + + if (CL_rc == -1) return -1; + } + + cl_event opencl_event; + + if ((hashconfig->opts_type & OPTS_TYPE_PT_BITSLICE) && (user_options->attack_mode == ATTACK_MODE_BF)) + { + const size_t global_work_size[3] = { num_elements, 32, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event); + + if (CL_rc == -1) return -1; + } + else + { + if (kern_run == KERN_RUN_1) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_INIT) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + else if (kern_run == KERN_RUN_2) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_LOOP) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + else if (kern_run == KERN_RUN_3) + { + if (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_SIMD_COMP) + { + num_elements = CEILDIV (num_elements, device_param->vector_width); + } + } + + num_elements = round_up_multiple_64 (num_elements, kernel_threads); + + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + CL_rc = hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event); + + if (CL_rc == -1) return -1; + } + + CL_rc = hc_clFlush (hashcat_ctx, device_param->opencl_command_queue); + + if (CL_rc == -1) return -1; + + // spin damper section + + const u32 iterationm = iteration % EXPECTED_ITERATIONS; + + cl_int opencl_event_status; + + size_t param_value_size_ret; + + CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, ¶m_value_size_ret); + + if (CL_rc == -1) return -1; + + if (device_param->spin_damp > 0) + { + double spin_total = device_param->spin_damp; + + while (opencl_event_status != CL_COMPLETE) + { + if (status_ctx->devices_status == STATUS_RUNNING) + { + switch (kern_run) + { + case KERN_RUN_1: if (device_param->exec_us_prev1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_2: if (device_param->exec_us_prev2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_3: if (device_param->exec_us_prev3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_4: if (device_param->exec_us_prev4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev4[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_INIT2: if (device_param->exec_us_prev_init2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_init2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_LOOP2: if (device_param->exec_us_prev_loop2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_loop2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX1: if (device_param->exec_us_prev_aux1[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux1[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX2: if (device_param->exec_us_prev_aux2[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux2[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX3: if (device_param->exec_us_prev_aux3[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux3[iterationm] * device_param->spin_damp)); break; + case KERN_RUN_AUX4: if (device_param->exec_us_prev_aux4[iterationm] > 0) usleep ((useconds_t) (device_param->exec_us_prev_aux4[iterationm] * device_param->spin_damp)); break; + } + } + else + { + // we were told to be nice + + sleep (0); + } + + CL_rc = hc_clGetEventInfo (hashcat_ctx, opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof (opencl_event_status), &opencl_event_status, ¶m_value_size_ret); + + if (CL_rc == -1) return -1; + + spin_total += device_param->spin_damp; + + if (spin_total > 1) break; + } + } + + CL_rc = hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event); + + if (CL_rc == -1) return -1; + + cl_ulong time_start; + cl_ulong time_end; + + CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_START, sizeof (time_start), &time_start, NULL); if (CL_rc == -1) return -1; + CL_rc = hc_clGetEventProfilingInfo (hashcat_ctx, opencl_event, CL_PROFILING_COMMAND_END, sizeof (time_end), &time_end, NULL); if (CL_rc == -1) return -1; + + const double exec_us = (double) (time_end - time_start) / 1000; + + if (device_param->spin_damp > 0) + { + if (status_ctx->devices_status == STATUS_RUNNING) + { + switch (kern_run) + { + case KERN_RUN_1: device_param->exec_us_prev1[iterationm] = exec_us; break; + case KERN_RUN_2: device_param->exec_us_prev2[iterationm] = exec_us; break; + case KERN_RUN_3: device_param->exec_us_prev3[iterationm] = exec_us; break; + case KERN_RUN_4: device_param->exec_us_prev4[iterationm] = exec_us; break; + case KERN_RUN_INIT2: device_param->exec_us_prev_init2[iterationm] = exec_us; break; + case KERN_RUN_LOOP2: device_param->exec_us_prev_loop2[iterationm] = exec_us; break; + case KERN_RUN_AUX1: device_param->exec_us_prev_aux1[iterationm] = exec_us; break; + case KERN_RUN_AUX2: device_param->exec_us_prev_aux2[iterationm] = exec_us; break; + case KERN_RUN_AUX3: device_param->exec_us_prev_aux3[iterationm] = exec_us; break; + case KERN_RUN_AUX4: device_param->exec_us_prev_aux4[iterationm] = exec_us; break; + } + } + } + + if (event_update) + { + u32 exec_pos = device_param->exec_pos; + + device_param->exec_msec[exec_pos] = exec_us / 1000; + + exec_pos++; + + if (exec_pos == EXEC_CACHE) + { + exec_pos = 0; + } + + device_param->exec_pos = exec_pos; + } + + CL_rc = hc_clReleaseEvent (hashcat_ctx, opencl_event); + + if (CL_rc == -1) return -1; + + CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue); + + if (CL_rc == -1) return -1; } - CL_rc = hc_clReleaseEvent (hashcat_ctx, event); - - if (CL_rc == -1) return -1; - - CL_rc = hc_clFinish (hashcat_ctx, device_param->opencl_command_queue); - - if (CL_rc == -1) return -1; - return 0; } @@ -9705,6 +10042,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { + device_param->kernel_params[ 0] = &device_param->cuda_d_pws_buf; + device_param->kernel_params[ 4] = &device_param->cuda_d_tmps; + device_param->kernel_params[ 5] = &device_param->cuda_d_hooks; } if (device_param->is_opencl == true) @@ -9729,6 +10069,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { if (device_param->is_cuda == true) { + device_param->kernel_params_mp[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + ? &device_param->cuda_d_pws_buf + : &device_param->cuda_d_pws_amp_buf; + + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp, 0, sizeof (cl_mem), device_param->kernel_params_mp[0]); if (CL_rc == -1) return -1; } if (device_param->is_opencl == true) @@ -9746,6 +10091,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { if (device_param->is_cuda == true) { + device_param->kernel_params_mp_l[0] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + ? &device_param->cuda_d_pws_buf + : &device_param->cuda_d_pws_amp_buf; + + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_mp_l, 0, sizeof (cl_mem), device_param->kernel_params_mp_l[0]); if (CL_rc == -1) return -1; } if (device_param->is_opencl == true) @@ -9766,6 +10116,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { if (device_param->is_cuda == true) { + device_param->kernel_params_amp[0] = &device_param->cuda_d_pws_buf; + device_param->kernel_params_amp[1] = &device_param->cuda_d_pws_amp_buf; + + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 0, sizeof (cl_mem), device_param->kernel_params_amp[0]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_amp, 1, sizeof (cl_mem), device_param->kernel_params_amp[1]); if (CL_rc == -1) return -1; } if (device_param->is_opencl == true) @@ -9781,6 +10136,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { + device_param->kernel_params_decompress[0] = &device_param->cuda_d_pws_idx; + device_param->kernel_params_decompress[1] = &device_param->cuda_d_pws_comp_buf; + device_param->kernel_params_decompress[2] = (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + ? &device_param->cuda_d_pws_buf + : &device_param->cuda_d_pws_amp_buf; + + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 0, sizeof (cl_mem), device_param->kernel_params_decompress[0]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 1, sizeof (cl_mem), device_param->kernel_params_decompress[1]); if (CL_rc == -1) return -1; + //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_decompress, 2, sizeof (cl_mem), device_param->kernel_params_decompress[2]); if (CL_rc == -1) return -1; } if (device_param->is_opencl == true)