From b1ca2ca539472ec0939e77082fb199c2a9e5010f Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 11 Apr 2023 10:17:32 +0000 Subject: [PATCH] Only try to allocate memory on a opencl device if it actually has memory --- include/backend.h | 2 + src/backend.c | 129 ++++++++++++++++++++++++---------------------- 2 files changed, 68 insertions(+), 63 deletions(-) diff --git a/include/backend.h b/include/backend.h index 3ab906e94..f78a3aff4 100644 --- a/include/backend.h +++ b/include/backend.h @@ -48,6 +48,8 @@ void generate_cached_kernel_mp_filename (const u32 opti_type, const u64 opts void generate_source_kernel_amp_filename (const u32 attack_kern, char *shared_dir, char *source_file); void generate_cached_kernel_amp_filename (const u32 attack_kern, char *cache_dir, const char *device_name_chksum, char *cached_file, bool is_metal); +bool read_kernel_binary (hashcat_ctx_t *hashcat_ctx, const char *kernel_file, size_t *kernel_lengths, char **kernel_sources); + int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u64 gidd, pw_t *pw); int copy_pws_idx (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u64 gidd, const u64 cnt, pw_idx_t *dest); diff --git a/src/backend.c b/src/backend.c index 1d73c0590..010e16e26 100644 --- a/src/backend.c +++ b/src/backend.c @@ -528,7 +528,7 @@ static bool opencl_test_instruction (hashcat_ctx_t *hashcat_ctx, cl_context cont return true; } -static bool read_kernel_binary (hashcat_ctx_t *hashcat_ctx, const char *kernel_file, size_t *kernel_lengths, char **kernel_sources) +bool read_kernel_binary (hashcat_ctx_t *hashcat_ctx, const char *kernel_file, size_t *kernel_lengths, char **kernel_sources) { HCFILE fp; @@ -6377,11 +6377,9 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // try CL_DEVICE_BOARD_NAME_AMD first, if it fails fall back to CL_DEVICE_NAME // since AMD ROCm does not identify itself at this stage we simply check for return code from clGetDeviceInfo() - #define CHECK_BOARD_NAME_AMD 1 - cl_int rc_board_name_amd = CL_INVALID_VALUE; - if (CHECK_BOARD_NAME_AMD) + if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { //backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -7869,6 +7867,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) backend_ctx->opencl_devices_active--; backend_ctx->backend_devices_active--; + continue; } @@ -7884,6 +7883,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) backend_ctx->opencl_devices_active--; backend_ctx->backend_devices_active--; + continue; } @@ -7979,94 +7979,97 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) */ } - // available device memory + // available device memory // This test causes an GPU memory usage spike. // In case there are multiple hashcat instances starting at the same time this will cause GPU out of memory errors which otherwise would not exist. // We will simply not run it if that device was skipped by the user. - #define MAX_ALLOC_CHECKS_CNT 8192 - #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024) - - device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE; - - if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) + if (device_param->device_global_mem) { - // OK, so the problem here is the following: - // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device, - // but there's no way to ask for available memory on the device. - // In combination, most OpenCL runtimes implementation of clCreateBuffer() - // are doing so called lazy memory allocation on the device. - // Now, if the user has X11 (or a game or anything that takes a lot of GPU memory) - // running on the host we end up with an error type of this: - // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE - // The clEnqueueNDRangeKernel() is because of the lazy allocation - // The best way to workaround this problem is if we would be able to ask for available memory, - // The idea here is to try to evaluate available memory by allocating it till it errors - - cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem)); + #define MAX_ALLOC_CHECKS_CNT 8192 + #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024) - u64 c; + device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE; - for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) + if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; + // OK, so the problem here is the following: + // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device, + // but there's no way to ask for available memory on the device. + // In combination, most OpenCL runtimes implementation of clCreateBuffer() + // are doing so called lazy memory allocation on the device. + // Now, if the user has X11 (or a game or anything that takes a lot of GPU memory) + // running on the host we end up with an error type of this: + // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE + // The clEnqueueNDRangeKernel() is because of the lazy allocation + // The best way to workaround this problem is if we would be able to ask for available memory, + // The idea here is to try to evaluate available memory by allocating it till it errors - // work around, for some reason apple opencl can't have buffers larger 2^31 - // typically runs into trap 6 - // maybe 32/64 bit problem affecting size_t? - // this seems to affect global memory as well no just single allocations + cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem)); - if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->is_metal == false)) + u64 c; + + for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) { - const size_t undocumented_single_allocation_apple = 0x7fffffff; + if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; - if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= undocumented_single_allocation_apple) break; - } + // work around, for some reason apple opencl can't have buffers larger 2^31 + // typically runs into trap 6 + // maybe 32/64 bit problem affecting size_t? + // this seems to affect global memory as well no just single allocations - cl_int CL_err; + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->is_metal == false)) + { + const size_t undocumented_single_allocation_apple = 0x7fffffff; - OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; + if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= undocumented_single_allocation_apple) break; + } - tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); + cl_int CL_err; - if (CL_err != CL_SUCCESS) - { - c--; + OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - break; - } + tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); - // transfer only a few byte should be enough to force the runtime to actually allocate the memory + if (CL_err != CL_SUCCESS) + { + c--; - u8 tmp_host[8]; + break; + } - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + // transfer only a few byte should be enough to force the runtime to actually allocate the memory - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - } + u8 tmp_host[8]; - device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; + if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (c > 0) - { - device_param->device_available_mem *= c; - } + if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + } - // clean up + device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; - for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) - { - if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; + if (c > 0) + { + device_param->device_available_mem *= c; + } - if (tmp_device[c] != NULL) + // clean up + + for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) { - if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1; + if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; + + if (tmp_device[c] != NULL) + { + if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1; + } } - } - hcfree (tmp_device); + hcfree (tmp_device); + } } hc_clReleaseCommandQueue (hashcat_ctx, command_queue);