diff --git a/docs/changes.txt b/docs/changes.txt
index ee94d1cb0..7a654d207 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -12,6 +12,13 @@
 - Fixed too early execution of some module functions which could make use of non-final values opts_type and opti_type
 - Fixed internal access on module option attribute OPTS_TYPE_SUGGEST_KG with the result that it was unused
+- Fixed race condition resulting in out of memory error on startup if multiple hashcat instances are started at the same time
+
+##
+## Improvements
+##
+
+- Startup time: Improved the startup time by avoiding some time intensive operations for skipped devices

 * changes v6.1.0 -> v6.1.1
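Both changelog entries stem from the same restructuring in backend.c below: the expensive per-device startup work now runs in a separate pass that is only entered for devices which survived the skip checks. A minimal sketch of that gating idea, independent of the real hashcat code (device_t, devices_cnt and probe_available_memory() are illustrative names, not hashcat identifiers):

  /* Pass 1 (not shown): cheap enumeration, which may set dev->skipped.
     Pass 2: expensive probing, only for devices that are still in play. */
  for (int idx = 0; idx < devices_cnt; idx++)
  {
    device_t *dev = &devices[idx];

    if (dev->skipped == true) continue; // skipped by the user or by sanity checks

    probe_available_memory (dev); // time and memory intensive, see below
  }

Skipping the probe for unused devices is also what removes the cross-instance race: an instance that skips a device no longer allocates almost all of that device's free memory during startup.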
diff --git a/src/backend.c b/src/backend.c
index 58aa8094f..939572903 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -5540,7 +5540,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       device_param->skipped = true;
     }

-    // some attributes have to be hardcoded because they are used for instance in the build options
+    // some attributes have to be set to hardcoded values because they are used, for instance, in the build options

     device_param->device_local_mem_type = CL_LOCAL;
     device_param->opencl_device_type    = CL_DEVICE_TYPE_GPU;
@@ -5616,11 +5616,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       cuda_devices_active++;
     }

-    CUcontext cuda_context;
-
-    if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
-
-    if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;
+    // instruction set

     // bcrypt optimization?
     //const int rc_cuCtxSetCacheConfig = hc_cuCtxSetCacheConfig (hashcat_ctx, CU_FUNC_CACHE_PREFER_SHARED);
@@ -5638,46 +5634,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
     device_param->has_mov64 = (sm >= 10) ? true : false;
     device_param->has_prmt  = (sm >= 20) ? true : false;

-    /*
-    #define RUN_INSTRUCTION_CHECKS() \
-      device_param->has_add   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-      device_param->has_addc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-      device_param->has_sub   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-      device_param->has_subc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-      device_param->has_bfe   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
-      device_param->has_lop3  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
-      device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
-      device_param->has_prmt  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
-
-    if (backend_devices_idx > 0)
-    {
-      hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
-
-      if (is_same_device_type (device_param, device_param_prev) == true)
-      {
-        device_param->has_add   = device_param_prev->has_add;
-        device_param->has_addc  = device_param_prev->has_addc;
-        device_param->has_sub   = device_param_prev->has_sub;
-        device_param->has_subc  = device_param_prev->has_subc;
-        device_param->has_bfe   = device_param_prev->has_bfe;
-        device_param->has_lop3  = device_param_prev->has_lop3;
-        device_param->has_mov64 = device_param_prev->has_mov64;
-        device_param->has_prmt  = device_param_prev->has_prmt;
-      }
-      else
-      {
-        RUN_INSTRUCTION_CHECKS();
-      }
-    }
-    else
-    {
-      RUN_INSTRUCTION_CHECKS();
-    }
+    // device_available_mem

-    #undef RUN_INSTRUCTION_CHECKS
-    */
+    CUcontext cuda_context;

-    // device_available_mem
+    if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;
+
+    if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;

     size_t free  = 0;
     size_t total = 0;
@@ -6269,6 +6232,25 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       }
     }

+    // instruction set
+
+    // fixed values work only for nvidia devices
+    // dynamic values for amd: see the time intensive section below
+
+    if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
+    {
+      const int sm = (device_param->sm_major * 10) + device_param->sm_minor;
+
+      device_param->has_add   = (sm >= 12) ? true : false;
+      device_param->has_addc  = (sm >= 12) ? true : false;
+      device_param->has_sub   = (sm >= 12) ? true : false;
+      device_param->has_subc  = (sm >= 12) ? true : false;
+      device_param->has_bfe   = (sm >= 20) ? true : false;
+      device_param->has_lop3  = (sm >= 50) ? true : false;
+      device_param->has_mov64 = (sm >= 10) ? true : false;
+      device_param->has_prmt  = (sm >= 20) ? true : false;
+    }
+
     // common driver check

     if (device_param->skipped == false)
     {
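The hardcoded NVIDIA capability flags added above are derived purely from the SM version. The same mapping as a standalone sketch (plain C; the nv_caps_t struct and the helper name are illustrative, not part of hashcat):

  #include <stdbool.h>

  typedef struct
  {
    bool has_add, has_addc, has_sub, has_subc;
    bool has_bfe, has_lop3, has_mov64, has_prmt;
  } nv_caps_t;

  // Thresholds as in the patch, e.g. LOP3 requires Maxwell (SM 5.0) or newer.
  static nv_caps_t nv_caps_from_sm (const int sm_major, const int sm_minor)
  {
    const int sm = (sm_major * 10) + sm_minor; // e.g. compute capability 7.5 -> 75

    nv_caps_t caps;

    caps.has_add   = (sm >= 12);
    caps.has_addc  = (sm >= 12);
    caps.has_sub   = (sm >= 12);
    caps.has_subc  = (sm >= 12);
    caps.has_bfe   = (sm >= 20);
    caps.has_lop3  = (sm >= 50);
    caps.has_mov64 = (sm >= 10);
    caps.has_prmt  = (sm >= 20);

    return caps;
  }

Since the flags depend only on the compute capability, the per-device test-kernel compilation that previously determined them is no longer needed at startup.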
@@ -6432,272 +6414,336 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         opencl_devices_active++;
       }
+      }
+    }
+  }

-      /**
-       * create context for each device
-       */
+  backend_ctx->opencl_devices_cnt    = opencl_devices_cnt;
+  backend_ctx->opencl_devices_active = opencl_devices_active;

-      cl_context context;
+  // all devices combined go into backend_* variables

-      /*
-      cl_context_properties properties[3];
+  backend_ctx->backend_devices_cnt    = cuda_devices_cnt + opencl_devices_cnt;
+  backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active;

-      properties[0] = CL_CONTEXT_PLATFORM;
-      properties[1] = (cl_context_properties) device_param->opencl_platform;
-      properties[2] = 0;
+  // find duplicate devices

-      CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
-      */
+  //if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0))
+  //{
+    // using force here enables both devices, which is the worst possible outcome
+    // many users force by default, so this is not a good idea

-      if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1;
+    //if (user_options->force == false)
+    //{
+      backend_ctx_find_alias_devices (hashcat_ctx);
+    //{
+    //}

-      /**
-       * create command-queue
-       */
+  if (backend_ctx->backend_devices_active == 0)
+  {
+    event_log_error (hashcat_ctx, "No devices found/left.");

-      cl_command_queue command_queue;
+    return -1;
+  }

-      if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1;
+  // now we can calculate the number of parallel running hook threads based on
+  // the number of cpu cores and the number of active compute devices
+  // unless overwritten by the user

-      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
-      {
-        #define RUN_INSTRUCTION_CHECKS() \
-          device_param->has_vadd     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
-          device_param->has_vaddc    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vadd_co  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
-          device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vsub     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
-          device_param->has_vsubb    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vsub_co  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
-          device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vadd3    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
-          device_param->has_vbfe     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
-          device_param->has_vperm    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
+  if (user_options->hook_threads == HOOK_THREADS)
+  {
+    const u32 processor_count = hc_get_processor_count ();

-        if (backend_devices_idx > 0)
-        {
-          hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
+    const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0

-          if (is_same_device_type (device_param, device_param_prev) == true)
-          {
-            device_param->has_vadd     = device_param_prev->has_vadd;
-            device_param->has_vaddc    = device_param_prev->has_vaddc;
-            device_param->has_vadd_co  = device_param_prev->has_vadd_co;
-            device_param->has_vaddc_co = device_param_prev->has_vaddc_co;
-            device_param->has_vsub     = device_param_prev->has_vsub;
-            device_param->has_vsubb    = device_param_prev->has_vsubb;
-            device_param->has_vsub_co  = device_param_prev->has_vsub_co;
-            device_param->has_vsubb_co = device_param_prev->has_vsubb_co;
-            device_param->has_vadd3    = device_param_prev->has_vadd3;
-            device_param->has_vbfe     = device_param_prev->has_vbfe;
-            device_param->has_vperm    = device_param_prev->has_vperm;
-          }
-          else
-          {
-            RUN_INSTRUCTION_CHECKS();
-          }
-        }
-        else
-        {
-          RUN_INSTRUCTION_CHECKS();
-        }
+    user_options->hook_threads = processor_count_cu;
+  }

-        #undef RUN_INSTRUCTION_CHECKS
-      }
+  // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)

-      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
-      {
-        const int sm = (device_param->sm_major * 10) + device_param->sm_minor;
+  if (backend_ctx->backend_devices_filter != (u64) -1)
+  {
+    const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);

-        device_param->has_add   = (sm >= 12) ? true : false;
-        device_param->has_addc  = (sm >= 12) ? true : false;
-        device_param->has_sub   = (sm >= 12) ? true : false;
-        device_param->has_subc  = (sm >= 12) ? true : false;
-        device_param->has_bfe   = (sm >= 20) ? true : false;
-        device_param->has_lop3  = (sm >= 50) ? true : false;
-        device_param->has_mov64 = (sm >= 10) ? true : false;
-        device_param->has_prmt  = (sm >= 20) ? true : false;
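The hook-thread sizing added a few lines above splits the CPU cores evenly across the active devices, rounding up. A worked example of that arithmetic as a standalone C sketch (the macro spelling follows the common rounding-up idiom and the counts are invented; only the formula is taken from the patch):

  #include <stdio.h>

  // rounding-up integer division, as CEILDIV is used in the hunk above
  #define CEILDIV(a,b) (((a) + (b) - 1) / (b))

  int main (void)
  {
    const unsigned processor_count        = 16; // example: host CPU cores
    const unsigned backend_devices_active = 3;  // example: active compute devices

    // ceil (16 / 3) = 6 hook threads per device, so no device starves on 16 cores
    printf ("hook_threads = %u\n", CEILDIV (processor_count, backend_devices_active));

    return 0;
  }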
+    if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask)
+    {
+      event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
+      event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);

-        /*
-        #define RUN_INSTRUCTION_CHECKS() \
-          device_param->has_add   = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-          device_param->has_addc  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-          device_param->has_sub   = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-          device_param->has_subc  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
-          device_param->has_bfe   = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
-          device_param->has_lop3  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
-          device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
-          device_param->has_prmt  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
-
-        if (backend_devices_idx > 0)
-        {
-          hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
+      return -1;
+    }
+  }

-          if (is_same_device_type (device_param, device_param_prev) == true)
-          {
-            device_param->has_add   = device_param_prev->has_add;
-            device_param->has_addc  = device_param_prev->has_addc;
-            device_param->has_sub   = device_param_prev->has_sub;
-            device_param->has_subc  = device_param_prev->has_subc;
-            device_param->has_bfe   = device_param_prev->has_bfe;
-            device_param->has_lop3  = device_param_prev->has_lop3;
-            device_param->has_mov64 = device_param_prev->has_mov64;
-            device_param->has_prmt  = device_param_prev->has_prmt;
-          }
-          else
-          {
-            RUN_INSTRUCTION_CHECKS();
-          }
-        }
-        else
-        {
-          RUN_INSTRUCTION_CHECKS();
-        }
+  // time or resource intensive operations which we do not run if the corresponding device was skipped by the user

-        #undef RUN_INSTRUCTION_CHECKS
-        */
-      }
+  if (backend_ctx->cuda)
+  {
+    // instruction test for cuda devices was replaced with fixed values (see above)

-      // device_available_mem
+    /*
+    CUcontext cuda_context;

-      #define MAX_ALLOC_CHECKS_CNT  8192
-      #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024)
+    if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) return -1;

-      device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE;
+    if (hc_cuCtxSetCurrent (hashcat_ctx, cuda_context) == -1) return -1;

-      #if defined (_WIN)
-      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
-      #else
-      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)))
-      #endif
-      {
-        // OK, so the problem here is the following:
-        // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device,
-        // but there's no way to ask for available memory on the device.
-        // In combination, most OpenCL runtimes implementation of clCreateBuffer()
-        // are doing so called lazy memory allocation on the device.
-        // Now, if the user has X11 (or a game or anything that takes a lot of GPU memory)
-        // running on the host we end up with an error type of this:
-        // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
-        // The clEnqueueNDRangeKernel() is because of the lazy allocation
-        // The best way to workaround this problem is if we would be able to ask for available memory,
-        // The idea here is to try to evaluate available memory by allocating it till it errors
+    #define RUN_INSTRUCTION_CHECKS() \
+      device_param->has_add   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
+      device_param->has_addc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
+      device_param->has_sub   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
+      device_param->has_subc  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
+      device_param->has_bfe   = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \
+      device_param->has_lop3  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \
+      device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
+      device_param->has_prmt  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \

-        cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));
+    if (backend_devices_idx > 0)
+    {
+      hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];

-        u64 c;
+      if (is_same_device_type (device_param, device_param_prev) == true)
+      {
+        device_param->has_add   = device_param_prev->has_add;
+        device_param->has_addc  = device_param_prev->has_addc;
+        device_param->has_sub   = device_param_prev->has_sub;
+        device_param->has_subc  = device_param_prev->has_subc;
+        device_param->has_bfe   = device_param_prev->has_bfe;
+        device_param->has_lop3  = device_param_prev->has_lop3;
+        device_param->has_mov64 = device_param_prev->has_mov64;
+        device_param->has_prmt  = device_param_prev->has_prmt;
+      }
+      else
+      {
+        RUN_INSTRUCTION_CHECKS();
+      }
+    }
+    else
+    {
+      RUN_INSTRUCTION_CHECKS();
+    }

-        for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
-        {
-          if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
+    #undef RUN_INSTRUCTION_CHECKS

-          cl_int CL_err;
+    if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) return -1;

-          OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
+    */
+  }

-          tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
+  if (backend_ctx->ocl)
+  {
+    for (int backend_devices_cnt = 0; backend_devices_cnt < backend_ctx->backend_devices_cnt; backend_devices_cnt++)
+    {
+      hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_cnt];

-          if (CL_err != CL_SUCCESS)
-          {
-            c--;
+      if (device_param->is_opencl == false) continue;

-            break;
-          }
+      if (device_param->skipped == true) continue;

-          // transfer only a few byte should be enough to force the runtime to actually allocate the memory
+      /**
+       * create context for each device
+       */
+
+      cl_context context;
+
+      /*
+      cl_context_properties properties[3];
+
+      properties[0] = CL_CONTEXT_PLATFORM;
+      properties[1] = (cl_context_properties) device_param->opencl_platform;
+      properties[2] = 0;
+
+      CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
+      */

-          u8 tmp_host[8];
+      if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) return -1;

-          if (ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+      /**
+       * create command-queue
+       */

-          if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+      cl_command_queue command_queue;

-          if (ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+      if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) return -1;

-          if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
-        }
+      // instruction set

-        device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
-
-        if (c > 0)
-        {
-          device_param->device_available_mem *= c;
-        }
+      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
+      {
+        #define RUN_INSTRUCTION_CHECKS() \
+          device_param->has_vadd     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
+          device_param->has_vaddc    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vadd_co  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
+          device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vsub     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
+          device_param->has_vsubb    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vsub_co  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \
+          device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vadd3    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
+          device_param->has_vbfe     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
+          device_param->has_vperm    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \
+
+        if (backend_devices_idx > 0)
+        {
+          hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1];
+
+          if (is_same_device_type (device_param, device_param_prev) == true)
+          {
+            device_param->has_vadd     = device_param_prev->has_vadd;
+            device_param->has_vaddc    = device_param_prev->has_vaddc;
+            device_param->has_vadd_co  = device_param_prev->has_vadd_co;
+            device_param->has_vaddc_co = device_param_prev->has_vaddc_co;
+            device_param->has_vsub     = device_param_prev->has_vsub;
+            device_param->has_vsubb    = device_param_prev->has_vsubb;
+            device_param->has_vsub_co  = device_param_prev->has_vsub_co;
+            device_param->has_vsubb_co = device_param_prev->has_vsubb_co;
+            device_param->has_vadd3    = device_param_prev->has_vadd3;
+            device_param->has_vbfe     = device_param_prev->has_vbfe;
+            device_param->has_vperm    = device_param_prev->has_vperm;
+          }
+          else
+          {
+            RUN_INSTRUCTION_CHECKS();
+          }
+        }
+        else
+        {
+          RUN_INSTRUCTION_CHECKS();
+        }

-        // clean up
+        #undef RUN_INSTRUCTION_CHECKS
+      }

-        for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
-        {
-          if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
+      if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
+      {
+        // replaced with fixed values, see the non time intensive section above

-          if (tmp_device[c] != NULL)
-          {
-            if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
-          }
-        }
+        /*
+        #define RUN_INSTRUCTION_CHECKS() \
+          device_param->has_add   = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
+          device_param->has_addc  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
+          device_param->has_sub   = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
+          device_param->has_subc  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }"); \
}"); \ + device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ + device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ - hcfree (tmp_device); - } + if (backend_devices_idx > 0) + { + hc_device_param_t *device_param_prev = &devices_param[backend_devices_idx - 1]; - hc_clReleaseCommandQueue (hashcat_ctx, command_queue); + if (is_same_device_type (device_param, device_param_prev) == true) + { + device_param->has_add = device_param_prev->has_add; + device_param->has_addc = device_param_prev->has_addc; + device_param->has_sub = device_param_prev->has_sub; + device_param->has_subc = device_param_prev->has_subc; + device_param->has_bfe = device_param_prev->has_bfe; + device_param->has_lop3 = device_param_prev->has_lop3; + device_param->has_mov64 = device_param_prev->has_mov64; + device_param->has_prmt = device_param_prev->has_prmt; + } + else + { + RUN_INSTRUCTION_CHECKS(); + } + } + else + { + RUN_INSTRUCTION_CHECKS(); + } - hc_clReleaseContext (hashcat_ctx, context); + #undef RUN_INSTRUCTION_CHECKS + */ } - } - } - backend_ctx->opencl_devices_cnt = opencl_devices_cnt; - backend_ctx->opencl_devices_active = opencl_devices_active; + // available device memory + // This test causes an GPU memory usage spike. + // In case there are multiple hashcat instances starting at the same time this will cause GPU out of memory errors which otherwise would not exist. + // We will simply not run it if that device was skipped by the user. - // all devices combined go into backend_* variables + #define MAX_ALLOC_CHECKS_CNT 8192 + #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024) - backend_ctx->backend_devices_cnt = cuda_devices_cnt + opencl_devices_cnt; - backend_ctx->backend_devices_active = cuda_devices_active + opencl_devices_active; + device_param->device_available_mem = device_param->device_global_mem - MAX_ALLOC_CHECKS_SIZE; - // find duplicate devices + #if defined (_WIN) + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV)) + #else + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) || (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))) + #endif + { + // OK, so the problem here is the following: + // There's just CL_DEVICE_GLOBAL_MEM_SIZE to ask OpenCL about the total memory on the device, + // but there's no way to ask for available memory on the device. + // In combination, most OpenCL runtimes implementation of clCreateBuffer() + // are doing so called lazy memory allocation on the device. 
+        // Now, if the user has X11 (or a game or anything else that takes a lot of GPU memory)
+        // running on the host, we end up with an error like this:
+        // clEnqueueNDRangeKernel(): CL_MEM_OBJECT_ALLOCATION_FAILURE
+        // The clEnqueueNDRangeKernel() fails because of the lazy allocation.
+        // The best way to work around this problem would be to ask for the available memory directly,
+        // so the idea here is to estimate the available memory by allocating it until it errors.

-  //if ((cuda_devices_cnt > 0) && (opencl_devices_cnt > 0))
-  //{
-    // using force here enables both devices, which is the worst possible outcome
-    // many users force by default, so this is not a good idea
+        cl_mem *tmp_device = (cl_mem *) hccalloc (MAX_ALLOC_CHECKS_CNT, sizeof (cl_mem));

-    //if (user_options->force == false)
-    //{
-      backend_ctx_find_alias_devices (hashcat_ctx);
-    //{
-    //}
+        u64 c;

-  if (backend_ctx->backend_devices_active == 0)
-  {
-    event_log_error (hashcat_ctx, "No devices found/left.");
+        for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
+        {
+          if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;

-    return -1;
-  }
+          cl_int CL_err;

-  // now we can calculate the number of parallel running hook threads based on
-  // the number cpu cores and the number of active compute devices
-  // unless overwritten by the user
+          OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;

-  if (user_options->hook_threads == HOOK_THREADS)
-  {
-    const u32 processor_count = hc_get_processor_count ();
+          tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);

-    const u32 processor_count_cu = CEILDIV (processor_count, backend_ctx->backend_devices_active); // should never reach 0
+          if (CL_err != CL_SUCCESS)
+          {
+            c--;

-    user_options->hook_threads = processor_count_cu;
-  }
+            break;
+          }

-  // additional check to see if the user has chosen a device that is not within the range of available devices (i.e. larger than devices_cnt)
+          // transferring only a few bytes should be enough to force the runtime to actually allocate the memory

-  if (backend_ctx->backend_devices_filter != (u64) -1)
-  {
-    const u64 backend_devices_cnt_mask = ~(((u64) -1 >> backend_ctx->backend_devices_cnt) << backend_ctx->backend_devices_cnt);
+          u8 tmp_host[8];

-    if (backend_ctx->backend_devices_filter > backend_devices_cnt_mask)
-    {
-      event_log_error (hashcat_ctx, "An invalid device was specified using the --backend-devices parameter.");
-      event_log_error (hashcat_ctx, "The specified device was higher than the number of available devices (%u).", backend_ctx->backend_devices_cnt);
+          if (ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;

-      return -1;
-    }
-  }
+          if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+
+          if (ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+
+          if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+        }
+
+        device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
+
+        if (c > 0)
+        {
+          device_param->device_available_mem *= c;
+        }
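The estimate computed above is simply the chunk size times the number of successfully allocated chunks. A standalone illustration of that arithmetic (plain C; the chunk count of 53 is invented for the example):

  #include <stdio.h>

  #define MAX_ALLOC_CHECKS_SIZE (64 * 1024 * 1024) // 64 MiB chunks, as in the patch

  int main (void)
  {
    // Suppose 53 chunk allocations succeeded before clCreateBuffer failed:
    unsigned long long c = 53;

    unsigned long long device_available_mem = MAX_ALLOC_CHECKS_SIZE;

    if (c > 0) device_available_mem *= c;

    // 53 * 64 MiB = 3392 MiB, a conservative lower bound on usable memory
    printf ("estimated available: %llu bytes\n", device_available_mem);

    return 0;
  }

Because every chunk stays allocated until the clean up below, this probe is the memory usage spike the comment above warns about: two instances probing the same GPU concurrently can each grab a large share of the chunks and then fail later with real allocations, which is the startup race condition this patch fixes.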
+
+        // clean up
+
+        for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
+        {
+          if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
+
+          if (tmp_device[c] != NULL)
+          {
+            if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
+          }
+        }
+
+        hcfree (tmp_device);
+      }
+
+      hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
+
+      hc_clReleaseContext (hashcat_ctx, context);
     }
   }
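The --backend-devices validation that this patch relocates (the backend_devices_filter check in the hunks above) relies on a mask of the low backend_devices_cnt bits. A standalone check of that bit arithmetic (plain C; the example filter value is invented):

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    const uint64_t cnt = 3; // number of available devices (assumes 0 < cnt < 64)

    // ~(((u64) -1 >> cnt) << cnt) keeps exactly the low cnt bits:
    //   (uint64_t) -1 >> 3 = 0x1fffffffffffffff
    //   ...           << 3 = 0xfffffffffffffff8
    //   ~...               = 0x0000000000000007
    const uint64_t mask = ~(((uint64_t) -1 >> cnt) << cnt);

    // --backend-devices 1,4 would set bits 0 and 3:
    const uint64_t filter = (1ULL << 0) | (1ULL << 3);

    // 0x9 > 0x7, so device 4 is outside the range of the 3 available devices
    if (filter > mask) printf ("invalid --backend-devices selection\n");

    return 0;
  }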