mirror of
https://github.com/hashcat/hashcat.git
synced 2025-07-04 22:02:36 +00:00
OpenCL Backend: added workaround to set device_available_memory from CUDA/HIP alias device
This commit is contained in:
parent
2af580b448
commit
12f1fe56aa
@ -130,6 +130,7 @@
|
|||||||
- Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py`
|
- Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py`
|
||||||
- Modules: Updated module_unstable_warning
|
- Modules: Updated module_unstable_warning
|
||||||
- Open Document Format: Added support for small documents with content length < 1024
|
- Open Document Format: Added support for small documents with content length < 1024
|
||||||
|
- OpenCL Backend: added workaround to set device_available_memory from CUDA/HIP alias device
|
||||||
- Status Code: Add specific return code for self-test fail (-11)
|
- Status Code: Add specific return code for self-test fail (-11)
|
||||||
- Scrypt: Increase buffer sizes in module for hash mode 8900 to allow longer scrypt digests
|
- Scrypt: Increase buffer sizes in module for hash mode 8900 to allow longer scrypt digests
|
||||||
- Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629
|
- Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629
|
||||||
|
233
src/backend.c
233
src/backend.c
@ -8362,114 +8362,165 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
|
|||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
// available device memory
|
// available device memory
|
||||||
// This test causes an GPU memory usage spike.
|
// first trying to check if we can get device_available_mem from cuda/hip alias device
|
||||||
// In case there are multiple hashcat instances starting at the same time this will cause GPU out of memory errors which otherwise would not exist.
|
|
||||||
// We will simply not run it if that device was skipped by the user.
|
|
||||||
|
|
||||||
if (device_param->device_global_mem)
|
bool updated_device_available_mem = false;
|
||||||
|
|
||||||
|
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
|
||||||
{
|
{
|
||||||
#define MAX_ALLOC_CHECKS_CNT 8192
|
if (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)
|
||||||
#define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
|
|
||||||
|
|
||||||
device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;
|
|
||||||
|
|
||||||
if (user_options->backend_devices_keepfree)
|
|
||||||
{
|
{
|
||||||
device_param->device_available_mem = (device_param->device_global_mem * (100 - user_options->backend_devices_keepfree)) / 100;
|
if (backend_ctx->cuda_devices_cnt > 0 && backend_ctx->cuda_devices_active > 0)
|
||||||
|
{
|
||||||
|
for (int cuda_devices_idx = 0; cuda_devices_idx < backend_ctx->cuda_devices_cnt; cuda_devices_idx++)
|
||||||
|
{
|
||||||
|
const int tmp_backend_devices_idx = backend_ctx->backend_device_from_cuda[cuda_devices_idx];
|
||||||
|
|
||||||
|
hc_device_param_t *tmp_device_param = backend_ctx->devices_param + tmp_backend_devices_idx;
|
||||||
|
|
||||||
|
if (is_same_device (device_param, tmp_device_param))
|
||||||
|
{
|
||||||
|
device_param->device_available_mem = tmp_device_param->device_available_mem;
|
||||||
|
updated_device_available_mem = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// this section is creating more problems than it solves, so lets use a fixed multiplier instead
|
else if (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)
|
||||||
// users can override with --backend-devices-keepfree=0
|
|
||||||
else if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == 0))
|
|
||||||
{
|
{
|
||||||
// OK, so the problem here is the following:
|
if (backend_ctx->hip_devices_cnt > 0 && backend_ctx->hip_devices_active > 0)
|
||||||
// There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
|
|
||||||
// but there's no way to ask for available memory on the device.
|
|
||||||
// In combination, most OpenCL runtimes implementation of clCreateBuffer()
|
|
||||||
// are doing so called lazy memory allocation on the device.
|
|
||||||
// Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
|
|
||||||
// running on the host we end up with an error type of this:
|
|
||||||
// clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
|
|
||||||
// The clEnqueueNDRangeKernel() is because of the lazy allocation
|
|
||||||
// The best way to workaround this problem is if we would be able to ask for available memory,
|
|
||||||
// The idea here is to try to evaluate available memory by allocating it till it errors
|
|
||||||
|
|
||||||
cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));
|
|
||||||
|
|
||||||
u64 c;
|
|
||||||
|
|
||||||
for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
|
|
||||||
{
|
{
|
||||||
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
|
for (int hip_devices_idx = 0; hip_devices_idx < backend_ctx->hip_devices_cnt; hip_devices_idx++)
|
||||||
|
|
||||||
// work around, for some reason apple opencl can't have buffers larger 2^31
|
|
||||||
// typically runs into trap 6
|
|
||||||
// maybe 32/64 bit problem affecting size_t?
|
|
||||||
// this seems to affect global memory as well no just single allocations
|
|
||||||
|
|
||||||
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->is_metal == false))
|
|
||||||
{
|
{
|
||||||
const size_t undocumented_single_allocation_apple = 0x7fffffff;
|
const int tmp_backend_devices_idx = backend_ctx->backend_device_from_hip[hip_devices_idx];
|
||||||
|
|
||||||
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= undocumented_single_allocation_apple) break;
|
hc_device_param_t *tmp_device_param = backend_ctx->devices_param + tmp_backend_devices_idx;
|
||||||
}
|
|
||||||
|
|
||||||
cl_int CL_err;
|
if (is_same_device (device_param, tmp_device_param))
|
||||||
|
{
|
||||||
OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
|
device_param->device_available_mem = tmp_device_param->device_available_mem;
|
||||||
|
updated_device_available_mem = true;
|
||||||
tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
|
break;
|
||||||
|
}
|
||||||
if (CL_err != CL_SUCCESS)
|
|
||||||
{
|
|
||||||
c--;
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// transfer only a few byte should be enough to force the runtime to actually allocate the memory
|
|
||||||
|
|
||||||
u8 tmp_host[8];
|
|
||||||
|
|
||||||
if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
|
||||||
if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
|
||||||
|
|
||||||
if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
|
||||||
if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
|
||||||
}
|
|
||||||
|
|
||||||
device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
|
|
||||||
|
|
||||||
if (c > 0)
|
|
||||||
{
|
|
||||||
device_param->device_available_mem *= c;
|
|
||||||
}
|
|
||||||
|
|
||||||
// clean up
|
|
||||||
|
|
||||||
for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
|
|
||||||
{
|
|
||||||
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
|
|
||||||
|
|
||||||
if (tmp_device[c] != NULL)
|
|
||||||
{
|
|
||||||
if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
hcfree (tmp_device);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
|
// if not found ... use old strategy
|
||||||
|
|
||||||
hc_clReleaseContext (hashcat_ctx, context);
|
if (updated_device_available_mem == false)
|
||||||
|
|
||||||
if (device_param->device_host_unified_memory == 1)
|
|
||||||
{
|
{
|
||||||
// so, we actually have only half the memory because we need the same buffers on host side
|
// This test causes an GPU memory usage spike.
|
||||||
|
// In case there are multiple hashcat instances starting at the same time this will cause GPU out of memory errors which otherwise would not exist.
|
||||||
|
// We will simply not run it if that device was skipped by the user.
|
||||||
|
|
||||||
device_param->device_available_mem /= 2;
|
if (device_param->device_global_mem)
|
||||||
|
{
|
||||||
|
#define MAX_ALLOC_CHECKS_CNT 8192
|
||||||
|
#define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
|
||||||
|
|
||||||
|
device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;
|
||||||
|
|
||||||
|
if (user_options->backend_devices_keepfree)
|
||||||
|
{
|
||||||
|
device_param->device_available_mem = (device_param->device_global_mem * (100 - user_options->backend_devices_keepfree)) / 100;
|
||||||
|
}
|
||||||
|
// this section is creating more problems than it solves, so lets use a fixed multiplier instead
|
||||||
|
// users can override with --backend-devices-keepfree=0
|
||||||
|
else if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == 0))
|
||||||
|
{
|
||||||
|
// OK, so the problem here is the following:
|
||||||
|
// There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
|
||||||
|
// but there's no way to ask for available memory on the device.
|
||||||
|
// In combination, most OpenCL runtimes implementation of clCreateBuffer()
|
||||||
|
// are doing so called lazy memory allocation on the device.
|
||||||
|
// Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
|
||||||
|
// running on the host we end up with an error type of this:
|
||||||
|
// clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
|
||||||
|
// The clEnqueueNDRangeKernel() is because of the lazy allocation
|
||||||
|
// The best way to workaround this problem is if we would be able to ask for available memory,
|
||||||
|
// The idea here is to try to evaluate available memory by allocating it till it errors
|
||||||
|
|
||||||
|
cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));
|
||||||
|
|
||||||
|
u64 c;
|
||||||
|
|
||||||
|
for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
|
||||||
|
{
|
||||||
|
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
|
||||||
|
|
||||||
|
// work around, for some reason apple opencl can't have buffers larger 2^31
|
||||||
|
// typically runs into trap 6
|
||||||
|
// maybe 32/64 bit problem affecting size_t?
|
||||||
|
// this seems to affect global memory as well no just single allocations
|
||||||
|
|
||||||
|
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->is_metal == false))
|
||||||
|
{
|
||||||
|
const size_t undocumented_single_allocation_apple = 0x7fffffff;
|
||||||
|
|
||||||
|
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= undocumented_single_allocation_apple) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_int CL_err;
|
||||||
|
|
||||||
|
OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
|
||||||
|
|
||||||
|
tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
|
||||||
|
|
||||||
|
if (CL_err != CL_SUCCESS)
|
||||||
|
{
|
||||||
|
c--;
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// transfer only a few byte should be enough to force the runtime to actually allocate the memory
|
||||||
|
|
||||||
|
u8 tmp_host[8];
|
||||||
|
|
||||||
|
if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
||||||
|
if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
||||||
|
|
||||||
|
if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
||||||
|
if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
|
||||||
|
|
||||||
|
if (c > 0)
|
||||||
|
{
|
||||||
|
device_param->device_available_mem *= c;
|
||||||
|
}
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
|
||||||
|
for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
|
||||||
|
{
|
||||||
|
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
|
||||||
|
|
||||||
|
if (tmp_device[c] != NULL)
|
||||||
|
{
|
||||||
|
if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hcfree (tmp_device);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
|
||||||
|
|
||||||
|
hc_clReleaseContext (hashcat_ctx, context);
|
||||||
|
|
||||||
|
if (device_param->device_host_unified_memory == 1)
|
||||||
|
{
|
||||||
|
// so, we actually have only half the memory because we need the same buffers on host side
|
||||||
|
|
||||||
|
device_param->device_available_mem /= 2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user