Fixed race condition resulting in out of memory error on startup if multiple hashcat instances are started at the same time

pull/2519/head
Jens Steube 4 years ago
parent 6d5e1d3e5d
commit e21463da4b

@ -12,6 +12,13 @@
- Fixed too early execution of some module functions which could make use of non-final values opts_type and opti_type - Fixed too early execution of some module functions which could make use of non-final values opts_type and opti_type
- Fixed internal access on module option attribute OPTS_TYPE_SUGGEST_KG with the result that it was unused - Fixed internal access on module option attribute OPTS_TYPE_SUGGEST_KG with the result that it was unused
- Fixed race condition resulting in out of memory error on startup if multiple hashcat instances are started at the same time
##
## Improvements
##
- Startup time: Improved the startup time by avoiding some time intensive operations for skipped devices
* changes v6.1.0 -> v6.1.1 * changes v6.1.0 -> v6.1.1

@ -5540,7 +5540,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
device_param->skipped = true; device_param->skipped = true;
} }
// some attributes have to be hardcoded because they are used for instance in the build options // some attributes have to be hardcoded values because they are used for instance in the build options
device_param->device_local_mem_type = CL_LOCAL; device_param->device_local_mem_type = CL_LOCAL;
device_param->opencl_device_type = CL_DEVICE_TYPE_GPU; device_param->opencl_device_type = CL_DEVICE_TYPE_GPU;
@ -5616,11 +5616,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
cuda_devices_active++; cuda_devices_active++;
} }
CUcontext cuda_context; // instruction set
if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
// bcrypt optimization? // bcrypt optimization?
//const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED); //const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED);
@ -5638,46 +5634,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
device_param->has_mov64 = (sm >= 10) ? true : false; device_param->has_mov64 = (sm >= 10) ? true : false;
device_param->has_prmt = (sm >= 20) ? true : false; device_param->has_prmt = (sm >= 20) ? true : false;
/* // device_available_mem
#define RUN_INSTRUCTION_CHECKS() \
device_param->has_add = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_addc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_sub = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_subc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
device_param->has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}
#undef RUN_INSTRUCTION_CHECKS CUcontext cuda_context;
*/
// device_available_mem if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
size_t free = 0; size_t free = 0;
size_t total = 0; size_t total = 0;
@ -6269,6 +6232,25 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
} }
} }
// instruction set
// fixed values works only for nvidia devices
// dynamical values for amd see time intensive section below
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
{
const int sm = (device_param->sm_major * 10) + device_param->sm_minor;
device_param->has_add = (sm >= 12) ? true : false;
device_param->has_addc = (sm >= 12) ? true : false;
device_param->has_sub = (sm >= 12) ? true : false;
device_param->has_subc = (sm >= 12) ? true : false;
device_param->has_bfe = (sm >= 20) ? true : false;
device_param->has_lop3 = (sm >= 50) ? true : false;
device_param->has_mov64 = (sm >= 10) ? true : false;
device_param->has_prmt = (sm >= 20) ? true : false;
}
// common driver check // common driver check
if (device_param->skipped == false) if (device_param->skipped == false)
@ -6432,272 +6414,336 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
opencl_devices_active++; opencl_devices_active++;
} }
}
}
}
/** backend_ctx->opencl_devices_cnt = opencl_devices_cnt;
* create context for each device backend_ctx->opencl_devices_active = opencl_devices_active;
*/
cl_context context; // all devices combined go into backend_* variables
/* backend_ctx->backend_devices_cnt = cuda_devices_cnt + opencl_devices_cnt;
cl_context_properties properties[3]; backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active;
properties[0] = CL_CONTEXT_PLATFORM; // find duplicate devices
properties[1] = (cl_context_properties) device_param->opencl_platform;
properties[2] = 0;
CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context); //if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0))
*/ //{
// using force here enables both devices, which is the worst possible outcome
// many users force by default, so this is not a good idea
if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1; //if (user_options->force == false)
//{
backend_ctx_find_alias_devices (hashcat_ctx);
//{
//}
/** if (backend_ctx->backend_devices_active == 0)
* create command-queue {
*/ event_log_error (hashcat_ctx, "No devices found/left.");
cl_command_queue command_queue; return -1;
}
if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1; // now we can calculate the number of parallel running hook threads based on
// the number cpu cores and the number of active compute devices
// unless overwritten by the user
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)) if (user_options->hook_threads == HOOK_THREADS)
{ {
#define RUN_INSTRUCTION_CHECKS() const u32 processor_count = hc_get_processor_count ();
device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
if (backend_devices_idx > 0) const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
if (is_same_device_type (device_param, device_param_prev) == true) user_options->hook_threads = processor_count_cu;
{ }
device_param->has_vadd = device_param_prev->has_vadd;
device_param->has_vaddc = device_param_prev->has_vaddc;
device_param->has_vadd_co = device_param_prev->has_vadd_co;
device_param->has_vaddc_co = device_param_prev->has_vaddc_co;
device_param->has_vsub = device_param_prev->has_vsub;
device_param->has_vsubb = device_param_prev->has_vsubb;
device_param->has_vsub_co = device_param_prev->has_vsub_co;
device_param->has_vsubb_co = device_param_prev->has_vsubb_co;
device_param->has_vadd3 = device_param_prev->has_vadd3;
device_param->has_vbfe = device_param_prev->has_vbfe;
device_param->has_vperm = device_param_prev->has_vperm;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}
#undef RUN_INSTRUCTION_CHECKS // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)
}
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)) if (backend_ctx->backend_devices_filter != (u64) -1)
{ {
const int sm = (device_param->sm_major * 10) + device_param->sm_minor; const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);
device_param->has_add = (sm >= 12) ? true : false; if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask)
device_param->has_addc = (sm >= 12) ? true : false; {
device_param->has_sub = (sm >= 12) ? true : false; event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
device_param->has_subc = (sm >= 12) ? true : false; event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
device_param->has_bfe = (sm >= 20) ? true : false;
device_param->has_lop3 = (sm >= 50) ? true : false;
device_param->has_mov64 = (sm >= 10) ? true : false;
device_param->has_prmt = (sm >= 20) ? true : false;
/* return -1;
#define RUN_INSTRUCTION_CHECKS() \ }
device_param->has_add = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \ }
device_param->has_addc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_sub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_subc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
if (is_same_device_type (device_param, device_param_prev) == true) // time or resource intensive operations which we do not run if the corresponding device was skipped by the user
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}
#undef RUN_INSTRUCTION_CHECKS if (backend_ctx->cuda)
*/ {
} // instruction test for cuda devices was replaced with fixed values (see above)
// device_available_mem /*
CUcontext cuda_context;
#define MAX_ALLOC_CHECKS_CNT 8192 if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
#define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE; if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
#if defined (_WIN) #define RUN_INSTRUCTION_CHECKS() \
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)) device_param->has_add = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
#else device_param->has_addc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))) device_param->has_sub = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
#endif device_param->has_subc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
{ device_param->has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
// OK, so the problem here is the following: device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
// There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device, device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
// but there's no way to ask for available memory on the device. device_param->has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
// In combination, most OpenCL runtimes implementation of clCreateBuffer()
// are doing so called lazy memory allocation on the device.
// Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
// running on the host we end up with an error type of this:
// clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
// The clEnqueueNDRangeKernel() is because of the lazy allocation
// The best way to workaround this problem is if we would be able to ask for available memory,
// The idea here is to try to evaluate available memory by allocating it till it errors
cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem)); if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
u64 c; if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}
for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) #undef RUN_INSTRUCTION_CHECKS
{
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
cl_int CL_err; if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) return -1;
OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; */
}
tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); if (backend_ctx->ocl)
{
for (int backend_devices_cnt = 0; backend_devices_cnt < backend_ctx->backend_devices_cnt; backend_devices_cnt++)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_cnt];
if (CL_err != CL_SUCCESS) if (device_param->is_opencl == false) continue;
{
c--;
break; if (device_param->skipped == true) continue;
}
// transfer only a few byte should be enough to force the runtime to actually allocate the memory /**
* create context for each device
*/
cl_context context;
/*
cl_context_properties properties[3];
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = (cl_context_properties) device_param->opencl_platform;
properties[2] = 0;
CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
*/
u8 tmp_host[8]; if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1;
if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; /**
* create command-queue
*/
if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; cl_command_queue command_queue;
if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1;
if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; // instruction set
}
device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
if (c > 0) {
#define RUN_INSTRUCTION_CHECKS()
device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
if (backend_devices_idx > 0)
{
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
if (is_same_device_type (device_param, device_param_prev) == true)
{ {
device_param->device_available_mem *= c; device_param->has_vadd = device_param_prev->has_vadd;
device_param->has_vaddc = device_param_prev->has_vaddc;
device_param->has_vadd_co = device_param_prev->has_vadd_co;
device_param->has_vaddc_co = device_param_prev->has_vaddc_co;
device_param->has_vsub = device_param_prev->has_vsub;
device_param->has_vsubb = device_param_prev->has_vsubb;
device_param->has_vsub_co = device_param_prev->has_vsub_co;
device_param->has_vsubb_co = device_param_prev->has_vsubb_co;
device_param->has_vadd3 = device_param_prev->has_vadd3;
device_param->has_vbfe = device_param_prev->has_vbfe;
device_param->has_vperm = device_param_prev->has_vperm;
} }
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}
// clean up #undef RUN_INSTRUCTION_CHECKS
}
for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
{ {
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; // replaced with fixed values see non time intensive section above
if (tmp_device[c] != NULL) /*
{ #define RUN_INSTRUCTION_CHECKS() \
if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1; device_param->has_add = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
} device_param->has_addc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
} device_param->has_sub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_subc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
hcfree (tmp_device); if (backend_devices_idx > 0)
} {
hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
hc_clReleaseCommandQueue (hashcat_ctx, command_queue); if (is_same_device_type (device_param, device_param_prev) == true)
{
device_param->has_add = device_param_prev->has_add;
device_param->has_addc = device_param_prev->has_addc;
device_param->has_sub = device_param_prev->has_sub;
device_param->has_subc = device_param_prev->has_subc;
device_param->has_bfe = device_param_prev->has_bfe;
device_param->has_lop3 = device_param_prev->has_lop3;
device_param->has_mov64 = device_param_prev->has_mov64;
device_param->has_prmt = device_param_prev->has_prmt;
}
else
{
RUN_INSTRUCTION_CHECKS();
}
}
else
{
RUN_INSTRUCTION_CHECKS();
}
hc_clReleaseContext (hashcat_ctx, context); #undef RUN_INSTRUCTION_CHECKS
*/
} }
}
}
backend_ctx->opencl_devices_cnt = opencl_devices_cnt; // available device memory
backend_ctx->opencl_devices_active = opencl_devices_active; // This test causes an GPU memory usage spike.
// In case there are multiple hashcat instances starting at the same time this will cause GPU out of memory errors which otherwise would not exist.
// We will simply not run it if that device was skipped by the user.
// all devices combined go into backend_* variables #define MAX_ALLOC_CHECKS_CNT 8192
#define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
backend_ctx->backend_devices_cnt = cuda_devices_cnt + opencl_devices_cnt; device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;
backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active;
// find duplicate devices #if defined (_WIN)
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
#else
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)))
#endif
{
// OK, so the problem here is the following:
// There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
// but there's no way to ask for available memory on the device.
// In combination, most OpenCL runtimes implementation of clCreateBuffer()
// are doing so called lazy memory allocation on the device.
// Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
// running on the host we end up with an error type of this:
// clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
// The clEnqueueNDRangeKernel() is because of the lazy allocation
// The best way to workaround this problem is if we would be able to ask for available memory,
// The idea here is to try to evaluate available memory by allocating it till it errors
//if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0)) cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));
//{
// using force here enables both devices, which is the worst possible outcome
// many users force by default, so this is not a good idea
//if (user_options->force == false) u64 c;
//{
backend_ctx_find_alias_devices (hashcat_ctx);
//{
//}
if (backend_ctx->backend_devices_active == 0) for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
{ {
event_log_error (hashcat_ctx, "No devices found/left."); if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
return -1; cl_int CL_err;
}
// now we can calculate the number of parallel running hook threads based on OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
// the number cpu cores and the number of active compute devices
// unless overwritten by the user
if (user_options->hook_threads == HOOK_THREADS) tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
{
const u32 processor_count = hc_get_processor_count ();
const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0 if (CL_err != CL_SUCCESS)
{
c--;
user_options->hook_threads = processor_count_cu; break;
} }
// additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt) // transfer only a few byte should be enough to force the runtime to actually allocate the memory
if (backend_ctx->backend_devices_filter != (u64) -1) u8 tmp_host[8];
{
const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);
if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask) if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
{
event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
return -1; if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
}
device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
if (c > 0)
{
device_param->device_available_mem *= c;
}
// clean up
for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
{
if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
if (tmp_device[c] != NULL)
{
if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
}
}
hcfree (tmp_device);
}
hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
hc_clReleaseContext (hashcat_ctx, context);
} }
} }

Loading…
Cancel
Save