From a2a1d04bcf66f9474d0d336b7d7c3f03fd30b6a7 Mon Sep 17 00:00:00 2001 From: Jukka Ojanen Date: Sat, 17 Jul 2021 19:00:10 +0300 Subject: [PATCH] Implement gpu_bzero --- OpenCL/shared.cl | 32 +++++++- include/types.h | 7 ++ src/backend.c | 186 +++++++++++++++++++++++++++++++++-------------- 3 files changed, 168 insertions(+), 57 deletions(-) diff --git a/OpenCL/shared.cl b/OpenCL/shared.cl index 3cc96e79f..fe4ff4087 100644 --- a/OpenCL/shared.cl +++ b/OpenCL/shared.cl @@ -117,10 +117,7 @@ KERNEL_FQ void gpu_memset (GLOBAL_AS uint4 *buf, const u32 value, const u64 gid_ #if defined IS_NATIVE r = value; #elif defined IS_OPENCL - r.s0 = value; - r.s1 = value; - r.s2 = value; - r.s3 = value; + r = (uint4) (value); #elif defined IS_CUDA r.x = value; r.y = value; @@ -136,6 +133,33 @@ KERNEL_FQ void gpu_memset (GLOBAL_AS uint4 *buf, const u32 value, const u64 gid_ buf[gid] = r; } +KERNEL_FQ void gpu_bzero(GLOBAL_AS uint4* buf, const u64 gid_max) +{ + const u64 gid = get_global_id(0); + + if (gid >= gid_max) return; + + uint4 r; + + #if defined IS_NATIVE + r = 0; + #elif defined IS_OPENCL + r = (uint4) (0); + #elif defined IS_CUDA + r.x = 0; + r.y = 0; + r.z = 0; + r.w = 0; + #elif defined IS_HIP + r.x = 0; + r.y = 0; + r.z = 0; + r.w = 0; + #endif + + buf[gid] = r; +} + KERNEL_FQ void gpu_atinit (GLOBAL_AS pw_t *buf, const u64 gid_max) { const u64 gid = get_global_id (0); diff --git a/include/types.h b/include/types.h index f68d3050c..6c48abeaf 100644 --- a/include/types.h +++ b/include/types.h @@ -1137,6 +1137,7 @@ typedef struct hc_device_param u32 kernel_wgs_amp; u32 kernel_wgs_tm; u32 kernel_wgs_memset; + u32 kernel_wgs_bzero; u32 kernel_wgs_atinit; u32 kernel_wgs_utf8toutf16le; u32 kernel_wgs_decompress; @@ -1373,6 +1374,7 @@ typedef struct hc_device_param void *kernel_params_amp[PARAMCNT]; void *kernel_params_tm[PARAMCNT]; void *kernel_params_memset[PARAMCNT]; + void *kernel_params_bzero[PARAMCNT]; void *kernel_params_atinit[PARAMCNT]; void *kernel_params_utf8toutf16le[PARAMCNT]; void *kernel_params_decompress[PARAMCNT]; @@ -1395,6 +1397,8 @@ typedef struct hc_device_param u32 kernel_params_memset_buf32[PARAMCNT]; u64 kernel_params_memset_buf64[PARAMCNT]; + u64 kernel_params_bzero_buf64[PARAMCNT]; + u32 kernel_params_atinit_buf32[PARAMCNT]; u64 kernel_params_atinit_buf64[PARAMCNT]; @@ -1439,6 +1443,7 @@ typedef struct hc_device_param CUfunction cuda_function_amp; CUfunction cuda_function_tm; CUfunction cuda_function_memset; + CUfunction cuda_function_bzero; CUfunction cuda_function_atinit; CUfunction cuda_function_utf8toutf16le; CUfunction cuda_function_decompress; @@ -1519,6 +1524,7 @@ typedef struct hc_device_param HIPfunction hip_function_amp; HIPfunction hip_function_tm; HIPfunction hip_function_memset; + HIPfunction hip_function_bzero; HIPfunction hip_function_atinit; HIPfunction hip_function_utf8toutf16le; HIPfunction hip_function_decompress; @@ -1604,6 +1610,7 @@ typedef struct hc_device_param cl_kernel opencl_kernel_amp; cl_kernel opencl_kernel_tm; cl_kernel opencl_kernel_memset; + cl_kernel opencl_kernel_bzero; cl_kernel opencl_kernel_atinit; cl_kernel opencl_kernel_utf8toutf16le; cl_kernel opencl_kernel_decompress; diff --git a/src/backend.c b/src/backend.c index 4caff74ce..862dc85bd 100644 --- a/src/backend.c +++ b/src/backend.c @@ -5018,19 +5018,10 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic const u64 kernel_threads = device_param->kernel_wgs_memset; - u64 num_elements = num16d; - - num_elements = CEILDIV (num_elements, kernel_threads); + u64 num_elements = CEILDIV (num16d, kernel_threads); CUfunction function = device_param->cuda_function_memset; - //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf); if (CU_rc == -1) return -1; - //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CU_rc == -1) return -1; - //CU_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CU_rc == -1) return -1; - - //const size_t global_work_size[3] = { num_elements, 1, 1 }; - //const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_memset, NULL) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; @@ -5055,7 +5046,38 @@ int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size) { - return run_cuda_kernel_memset (hashcat_ctx, device_param, buf, 0, size); + const u64 num16d = size / 16; + const u64 num16m = size % 16; + + if (num16d) + { + device_param->kernel_params_bzero[0] = (void *) &buf; + device_param->kernel_params_bzero_buf64[1] = num16d; + + const u64 kernel_threads = device_param->kernel_wgs_bzero; + + u64 num_elements = CEILDIV (num16d, kernel_threads); + + CUfunction function = device_param->cuda_function_bzero; + + if (hc_cuLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->cuda_stream, device_param->kernel_params_bzero, NULL) == -1) return -1; + + if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; + } + + if (num16m) + { + u32 tmp[4]; + + tmp[0] = 0; + tmp[1] = 0; + tmp[2] = 0; + tmp[3] = 0; + + if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1; + } + + return 0; } int run_hip_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 num) @@ -5111,19 +5133,10 @@ int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device const u64 kernel_threads = device_param->kernel_wgs_memset; - u64 num_elements = num16d; - - num_elements = CEILDIV (num_elements, kernel_threads); + u64 num_elements = CEILDIV (num16d, kernel_threads); HIPfunction function = device_param->hip_function_memset; - //HIP_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf); if (HIP_rc == -1) return -1; - //HIP_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (HIP_rc == -1) return -1; - //HIP_rc = hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (HIP_rc == -1) return -1; - - //const size_t global_work_size[3] = { num_elements, 1, 1 }; - //const size_t local_work_size[3] = { kernel_threads, 1, 1 }; - if (hc_hipLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, device_param->kernel_params_memset, NULL) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; @@ -5138,8 +5151,6 @@ int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device tmp[2] = value; tmp[3] = value; - // Apparently are allowed to do this: https://devtalk.nvidia.com/default/topic/761515/how-to-copy-to-device-memory-with-offset-/ - if (hc_hipMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1; } @@ -5148,7 +5159,38 @@ int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, HIPdeviceptr buf, const u64 size) { - return run_hip_kernel_memset (hashcat_ctx, device_param, buf, 0, size); + const u64 num16d = size / 16; + const u64 num16m = size % 16; + + if (num16d) + { + device_param->kernel_params_bzero[0] = (void *) &buf; + device_param->kernel_params_bzero_buf64[1] = num16d; + + const u64 kernel_threads = device_param->kernel_wgs_bzero; + + u64 num_elements = CEILDIV(num16d, kernel_threads); + + HIPfunction function = device_param->hip_function_bzero; + + if (hc_hipLaunchKernel (hashcat_ctx, function, num_elements, 1, 1, kernel_threads, 1, 1, 0, device_param->hip_stream, device_param->kernel_params_bzero, NULL) == -1) return -1; + + if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; + } + + if (num16m) + { + u32 tmp[4]; + + tmp[0] = 0; + tmp[1] = 0; + tmp[2] = 0; + tmp[3] = 0; + + if (hc_hipMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), tmp, num16m) == -1) return -1; + } + + return 0; } int run_opencl_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 num) @@ -5211,52 +5253,77 @@ int run_opencl_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *dev { const u64 num16d = size / 16; const u64 num16m = size % 16; + u32 tmp[4]; if (num16d) { - device_param->kernel_params_memset_buf32[1] = value; - device_param->kernel_params_memset_buf64[2] = num16d; - const u64 kernel_threads = device_param->kernel_wgs_memset; - u64 num_elements = num16d; - - num_elements = round_up_multiple_64 (num_elements, kernel_threads); + u64 num_elements = round_up_multiple_64 (num16d, kernel_threads); cl_kernel kernel = device_param->opencl_kernel_memset; - if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof (cl_mem), (void *) &buf) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof(cl_mem), (void *) &buf) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof(cl_uint), (void *) &value) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, kernel, 2, sizeof(cl_ulong), (void *) &num16d) == -1) return -1; const size_t global_work_size[3] = { num_elements, 1, 1 }; const size_t local_work_size[3] = { kernel_threads, 1, 1 }; if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; - - if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; - - if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; } if (num16m) { - u32 tmp[4]; - tmp[0] = value; tmp[1] = value; tmp[2] = value; tmp[3] = value; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_TRUE, num16d * 16, num16m, tmp, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_FALSE, num16d * 16, num16m, tmp, 0, NULL, NULL) == -1) return -1; } + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + return 0; } int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, cl_mem buf, const u64 size) { - return run_opencl_kernel_memset (hashcat_ctx, device_param, buf, 0, size); + const u64 num16d = size / 16; + const u64 num16m = size % 16; + u32 tmp[4]; + + if (num16d) + { + const u64 kernel_threads = device_param->kernel_wgs_bzero; + + u64 num_elements = round_up_multiple_64(num16d, kernel_threads); + + cl_kernel kernel = device_param->opencl_kernel_bzero; + + if (hc_clSetKernelArg (hashcat_ctx, kernel, 0, sizeof(cl_mem), (void *) &buf) == -1) return -1; + if (hc_clSetKernelArg (hashcat_ctx, kernel, 1, sizeof(cl_ulong), (void *) &num16d) == -1) return -1; + + const size_t global_work_size[3] = { num_elements, 1, 1 }; + const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL) == -1) return -1; + } + + if (num16m) + { + tmp[0] = 0; + tmp[1] = 0; + tmp[2] = 0; + tmp[3] = 0; + + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_FALSE, num16d * 16, num16m, tmp, 0, NULL, NULL) == -1) return -1; + } + + if (hc_clFinish (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; + + return 0; } int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kern_run, const u64 pws_pos, const u64 num, const u32 event_update, const u32 iteration) @@ -11689,9 +11756,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_preferred_wgs_multiple_memset = device_param->cuda_warp_size; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; + // GPU bzero + + if (hc_cuModuleGetFunction (hashcat_ctx, &device_param->cuda_function_bzero, device_param->cuda_module_shared, "gpu_bzero") == -1) return -1; + + if (get_cuda_kernel_wgs (hashcat_ctx, device_param->cuda_function_bzero, &device_param->kernel_wgs_bzero) == -1) return -1; // GPU autotune init @@ -11747,9 +11816,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_preferred_wgs_multiple_memset = device_param->hip_warp_size; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]); if (CL_rc == -1) return -1; - //CL_rc = hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]); if (CL_rc == -1) return -1; + // GPU bzero + + if (hc_hipModuleGetFunction (hashcat_ctx, &device_param->hip_function_bzero, device_param->hip_module_shared, "gpu_bzero") == -1) return -1; + + if (get_hip_kernel_wgs (hashcat_ctx, device_param->hip_function_bzero, &device_param->kernel_wgs_bzero) == -1) return -1; // GPU autotune init @@ -11805,6 +11876,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (get_opencl_kernel_preferred_wgs_multiple (hashcat_ctx, device_param, device_param->opencl_kernel_memset, &device_param->kernel_preferred_wgs_multiple_memset) == -1) return -1; + // GPU bzero + + if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_bzero", &device_param->opencl_kernel_bzero) == -1) return -1; + + if (get_opencl_kernel_wgs (hashcat_ctx, device_param, device_param->opencl_kernel_bzero, &device_param->kernel_wgs_bzero) == -1) return -1; + // GPU autotune init if (hc_clCreateKernel (hashcat_ctx, device_param->opencl_program_shared, "gpu_atinit", &device_param->opencl_kernel_atinit) == -1) return -1; @@ -12745,6 +12822,11 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1]; device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf64[2]; + device_param->kernel_params_bzero_buf64[1] = 0; // gid_max + + device_param->kernel_params_bzero[0] = NULL; + device_param->kernel_params_bzero[1] = &device_param->kernel_params_bzero_buf64[1]; + device_param->kernel_params_atinit_buf64[1] = 0; // gid_max device_param->kernel_params_atinit[0] = NULL; @@ -13996,12 +14078,6 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_opencl == true) { - // GPU memset - - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]) == -1) return -1; - if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_memset, 2, sizeof (cl_ulong), device_param->kernel_params_memset[2]) == -1) return -1; - // GPU autotune init if (hc_clSetKernelArg (hashcat_ctx, device_param->opencl_kernel_atinit, 0, sizeof (cl_mem), device_param->kernel_params_atinit[0]) == -1) return -1; @@ -15289,6 +15365,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->cuda_function_tm = NULL; device_param->cuda_function_amp = NULL; device_param->cuda_function_memset = NULL; + device_param->cuda_function_bzero = NULL; device_param->cuda_function_atinit = NULL; device_param->cuda_function_utf8toutf16le = NULL; device_param->cuda_function_decompress = NULL; @@ -15415,6 +15492,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->hip_function_tm = NULL; device_param->hip_function_amp = NULL; device_param->hip_function_memset = NULL; + device_param->hip_function_bzero = NULL; device_param->hip_function_atinit = NULL; device_param->hip_function_utf8toutf16le = NULL; device_param->hip_function_decompress = NULL; @@ -15492,6 +15570,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->opencl_kernel_tm) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_tm); if (device_param->opencl_kernel_amp) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_amp); if (device_param->opencl_kernel_memset) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_memset); + if (device_param->opencl_kernel_bzero) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_bzero); if (device_param->opencl_kernel_atinit) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_atinit); if (device_param->opencl_kernel_utf8toutf16le) hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_utf8toutf16le); if (device_param->opencl_kernel_decompress)hc_clReleaseKernel (hashcat_ctx, device_param->opencl_kernel_decompress); @@ -15562,6 +15641,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->opencl_kernel_tm = NULL; device_param->opencl_kernel_amp = NULL; device_param->opencl_kernel_memset = NULL; + device_param->opencl_kernel_bzero = NULL; device_param->opencl_kernel_atinit = NULL; device_param->opencl_kernel_utf8toutf16le = NULL; device_param->opencl_kernel_decompress = NULL;