diff --git a/docs/changes.txt b/docs/changes.txt index c3d74641b..6dc0d9bff 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -20,6 +20,7 @@ - Blake Kernels: Optimize BLAKE2B_ROUND() 64 bit rotates giving a 5% performance increase - Brain Session: Adds hashconfig specific opti_type and opts_type parameters to hashcat session computation to cover features like -O and -M +- Kernel Threads: Use warp size / wavefront size query instead of hardcoded values as base for kernel threads - Shared Memory: Calculate kernel dynamic memory size based on CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN - Slow Kernels: Set some of the slowest kernels to OPTS_TYPE_MP_MULTI_DISABLE @@ -27,6 +28,7 @@ ## Technical ## +- Kernel Cache: Add kernel threads into hash computation which is later used in the kernel cache filename - HIP Kernels: Got rid of hip/hip_runtime.h dependancy to enable more easy integration of the HIP backend on Windows - SCRYPT Kernels: Add more optimized values for some new NV/AMD GPUs diff --git a/include/types.h b/include/types.h index 45a7eb332..43e5a55da 100644 --- a/include/types.h +++ b/include/types.h @@ -1118,6 +1118,8 @@ typedef struct hc_device_param int sm_minor; u32 kernel_exec_timeout; + u32 kernel_preferred_wgs_multiple; + st_status_t st_status; int vector_width; diff --git a/src/backend.c b/src/backend.c index 7f61baf57..31d6bade1 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2516,14 +2516,14 @@ int hip_init (hashcat_ctx_t *hashcat_ctx) HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxGetCacheConfig, hipCtxGetCacheConfig, HIP_HIPCTXGETCACHECONFIG, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxGetCurrent, hipCtxGetCurrent, HIP_HIPCTXGETCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxGetSharedMemConfig, hipCtxGetSharedMemConfig, HIP_HIPCTXGETSHAREDMEMCONFIG, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipCtxGetCacheConfig, hipCtxGetCacheConfig, HIP_HIPCTXGETCACHECONFIG, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipCtxGetCurrent, hipCtxGetCurrent, HIP_HIPCTXGETCURRENT, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipCtxGetSharedMemConfig, hipCtxGetSharedMemConfig, HIP_HIPCTXGETSHAREDMEMCONFIG, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipCtxSetCacheConfig, hipCtxSetCacheConfig, HIP_HIPCTXSETCACHECONFIG, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxSetSharedMemConfig, hipCtxSetSharedMemConfig, HIP_HIPCTXSETSHAREDMEMCONFIG, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipCtxSetSharedMemConfig, hipCtxSetSharedMemConfig, HIP_HIPCTXSETSHAREDMEMCONFIG, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1); @@ -2539,9 +2539,9 @@ int hip_init (hashcat_ctx_t *hashcat_ctx) HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1); //HC_LOAD_FUNC_HIP (hip, hipFuncSetAttribute, hipFuncSetAttribute, HIP_HIPFUNCSETATTRIBUTE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipFuncSetCacheConfig, hipFuncSetCacheConfig, HIP_HIPFUNCSETCACHECONFIG, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipFuncSetSharedMemConfig, hipFuncSetSharedMemConfig, HIP_HIPFUNCSETSHAREDMEMCONFIG, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipGetErrorName, HIP_HIPGETERRORNAME, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipFuncSetCacheConfig, hipFuncSetCacheConfig, HIP_HIPFUNCSETCACHECONFIG, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipFuncSetSharedMemConfig, hipFuncSetSharedMemConfig, HIP_HIPFUNCSETSHAREDMEMCONFIG, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipGetErrorName, HIP_HIPGETERRORNAME, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1); @@ -2556,20 +2556,21 @@ int hip_init (hashcat_ctx_t *hashcat_ctx) HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemFreeHost, hipFreeHost, HIP_HIPMEMFREEHOST, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleLoad, hipModuleLoad, HIP_HIPMODULELOAD, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleLoadData, hipModuleLoadData, HIP_HIPMODULELOADDATA, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipProfilerStart, hipProfilerStart, HIP_HIPPROFILERSTART, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipProfilerStop, hipProfilerStop, HIP_HIPPROFILERSTOP, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipProfilerStart, hipProfilerStart, HIP_HIPPROFILERSTART, HIP, 1); + //HC_LOAD_FUNC_HIP (hip, hipProfilerStop, hipProfilerStop, HIP_HIPPROFILERSTOP, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipStreamWaitEvent, hipStreamWaitEvent, HIP_HIPSTREAMWAITEVENT, HIP, 1); + //TODO HIP? #if defined (WITH_CUBINX) HC_LOAD_FUNC_HIP (hip, hipLinkCreate, hipLinkCreate, HIP_HIPLINKCREATE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipLinkAddData, hipLinkAddData, HIP_HIPLINKADDDATA, HIP, 1); @@ -8070,6 +8071,18 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->kernel_exec_timeout = kernel_exec_timeout; + // warp size + + int warp_size = 0; + + if (hc_cuDeviceGetAttribute (hashcat_ctx, &warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuda_device) == -1) + { + device_param->skipped = true; + continue; + } + + device_param->kernel_preferred_wgs_multiple = warp_size; + // max_shared_memory_per_block int max_shared_memory_per_block = 0; @@ -8436,11 +8449,23 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->kernel_exec_timeout = kernel_exec_timeout; + // warp size + + int warp_size = 0; + + if (hc_hipDeviceGetAttribute (hashcat_ctx, &warp_size, HIP_DEVICE_ATTRIBUTE_WARP_SIZE, hip_device) == -1) + { + device_param->skipped = true; + continue; + } + + device_param->kernel_preferred_wgs_multiple = warp_size; + // max_shared_memory_per_block int max_shared_memory_per_block = 0; - if (hc_hipDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, hip_device) == -1) + if (hc_hipDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, hip_device) == -1) { device_param->skipped = true; continue; @@ -9053,6 +9078,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) hcfree (device_extensions); + // kernel_preferred_wgs_multiple + + // There is global query for this attribute on OpenCL that is not linked to a specific kernel, so we set it to a fixed value + // Later in the code, we add vendor specific extensions to query it + + device_param->kernel_preferred_wgs_multiple = 8; + // device_local_mem_type cl_device_local_mem_type device_local_mem_type; @@ -9332,6 +9364,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) { if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) { + cl_uint device_wavefront_width_amd; + + // from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_device_attribute_query.txt + #define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof (device_wavefront_width_amd), &device_wavefront_width_amd, NULL) == -1) + { + device_param->skipped = true; + continue; + } + + device_param->kernel_preferred_wgs_multiple = device_wavefront_width_amd; + cl_device_topology_amd amdtopo; if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TOPOLOGY_AMD, sizeof (amdtopo), &amdtopo, NULL) == -1) @@ -9348,6 +9393,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) { + cl_uint device_warp_size_nv; + + // from deps/OpenCL-Headers/CL/cl_ext.h + #define CL_DEVICE_WARP_SIZE_NV 0x4003 + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_WARP_SIZE_NV, sizeof (device_warp_size_nv), &device_warp_size_nv, NULL) == -1) + { + device_param->skipped = true; + continue; + } + + device_param->kernel_preferred_wgs_multiple = device_warp_size_nv; + cl_uint pci_bus_id_nv; // is cl_uint the right type for them?? cl_uint pci_slot_id_nv; @@ -10324,9 +10382,7 @@ static u32 get_kernel_threads (const hc_device_param_t *device_param) { // for all CPU we just do 1 ... - const u32 cpu_prefered_thread_count = 1; - - kernel_threads_max = MIN (kernel_threads_max, cpu_prefered_thread_count); + kernel_threads_max = MIN (kernel_threads_max, 1); } else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { @@ -10334,28 +10390,15 @@ static u32 get_kernel_threads (const hc_device_param_t *device_param) if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) { - const u32 gpu_prefered_thread_count = 8; - - kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count); + kernel_threads_max = MIN (kernel_threads_max, 8); } else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD) { - u32 gpu_prefered_thread_count = 64; - - if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) - { - // based on clinfo output: Preferred work group size multiple (kernel) - - gpu_prefered_thread_count = 32; - } - - kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count); + kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); } else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP) { - u32 gpu_prefered_thread_count = 64; - - kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count); + kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); } } @@ -10371,6 +10414,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p { const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; const folder_config_t *folder_config = hashcat_ctx->folder_config; + const user_options_t *user_options = hashcat_ctx->user_options; bool cached = true; @@ -10658,9 +10702,9 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p //hiprtc_options[1] = "--device-as-default-execution-space"; //hiprtc_options[2] = "--gpu-architecture"; - //hc_asprintf (&hiprtc_options[3], "compute_%d%d", device_param->sm_major, device_param->sm_minor); + hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple); - hiprtc_options[0] = "--gpu-max-threads-per-block=64"; + //hiprtc_options[0] = "--gpu-max-threads-per-block=64"; hiprtc_options[1] = "-nocudainc"; hiprtc_options[2] = "-nocudalib"; hiprtc_options[3] = ""; @@ -11408,24 +11452,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { - // for GPU we need to distinguish by vendor - - if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) - { - native_threads = 8; - } - else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD) - { - native_threads = 64; - } - else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP) - { - native_threads = 64; - } - else - { - native_threads = 32; - } + native_threads = device_param->kernel_preferred_wgs_multiple; } else { @@ -11751,7 +11778,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) char device_name_chksum_amp_mp[HCBUFSIZ_TINY] = { 0 }; - const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s", + const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%u", backend_ctx->comptime, backend_ctx->cuda_driver_version, backend_ctx->hip_driver_version, @@ -11759,7 +11786,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->opencl_platform_vendor_id, device_param->device_name, device_param->opencl_device_version, - device_param->opencl_driver_version); + device_param->opencl_driver_version, + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple); md5_ctx_t md5_ctx; @@ -12078,7 +12106,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) const u32 extra_value = (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) ? ATTACK_MODE_ASSOCIATION : ATTACK_MODE_NONE; - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d-%u-%u-%s", + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d-%u-%u-%u-%s", backend_ctx->comptime, backend_ctx->cuda_driver_version, backend_ctx->hip_driver_version, @@ -12090,6 +12118,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->vector_width, hashconfig->kern_type, extra_value, + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple, build_options_module_buf); md5_ctx_t md5_ctx; @@ -14810,14 +14839,6 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } } - // we - - if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP) - { - device_param->kernel_threads_min = MIN (device_param->kernel_threads_min, 64); - device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64); - } - /** * now everything that depends on threads and accel, basically dynamic workload */