mirror of https://github.com/hashcat/hashcat.git synced 2024-11-22 16:18:09 +00:00

Kernel Threads: Use warp size / wavefront size query instead of hardcoded values as base for kernel threads

Kernel Cache: Add kernel threads into hash computation which is later used in the kernel cache filename
Remove some unused function symbol lookups in the HIP library
Jens Steube 2021-07-22 11:46:47 +02:00
parent a09efb2634
commit 5024865d87
3 changed files with 87 additions and 62 deletions
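
The change in one sentence: instead of assuming 32 threads per warp on NVIDIA and 64 per wavefront on AMD, ask the driver and derive the kernel thread count from the answer. Below is a minimal standalone sketch (not hashcat code) of the CUDA side, using the same CU_DEVICE_ATTRIBUTE_WARP_SIZE attribute the commit queries; error handling is reduced to early returns:

#include <stdio.h>
#include <cuda.h> // CUDA driver API, link with -lcuda

int main (void)
{
  CUdevice device;

  int warp_size = 0;

  if (cuInit (0) != CUDA_SUCCESS) return 1;

  if (cuDeviceGet (&device, 0) != CUDA_SUCCESS) return 1;

  // queried instead of hardcoding 32 (NV) or 64 (AMD)
  if (cuDeviceGetAttribute (&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device) != CUDA_SUCCESS) return 1;

  printf ("warp size: %d\n", warp_size);

  return 0;
}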


@@ -20,6 +20,7 @@
 - Blake Kernels: Optimize BLAKE2B_ROUND() 64 bit rotates giving a 5% performance increase
 - Brain Session: Adds hashconfig specific opti_type and opts_type parameters to hashcat session computation to cover features like -O and -M
+- Kernel Threads: Use warp size / wavefront size query instead of hardcoded values as base for kernel threads
 - Shared Memory: Calculate kernel dynamic memory size based on CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN
 - Slow Kernels: Set some of the slowest kernels to OPTS_TYPE_MP_MULTI_DISABLE
@@ -27,6 +28,7 @@
 ## Technical
 ##
+- Kernel Cache: Add kernel threads into hash computation which is later used in the kernel cache filename
 - HIP Kernels: Got rid of hip/hip_runtime.h dependency to enable easier integration of the HIP backend on Windows
 - SCRYPT Kernels: Add more optimized values for some new NV/AMD GPUs


@@ -1118,6 +1118,8 @@ typedef struct hc_device_param
   int sm_minor;
   u32 kernel_exec_timeout;
 
+  u32 kernel_preferred_wgs_multiple;
+
   st_status_t st_status;
 
   int vector_width;


@@ -2516,14 +2516,14 @@ int hip_init (hashcat_ctx_t *hashcat_ctx)
   HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxGetCacheConfig, hipCtxGetCacheConfig, HIP_HIPCTXGETCACHECONFIG, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxGetCurrent, hipCtxGetCurrent, HIP_HIPCTXGETCURRENT, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxGetSharedMemConfig, hipCtxGetSharedMemConfig, HIP_HIPCTXGETSHAREDMEMCONFIG, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipCtxGetCacheConfig, hipCtxGetCacheConfig, HIP_HIPCTXGETCACHECONFIG, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipCtxGetCurrent, hipCtxGetCurrent, HIP_HIPCTXGETCURRENT, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipCtxGetSharedMemConfig, hipCtxGetSharedMemConfig, HIP_HIPCTXGETSHAREDMEMCONFIG, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipCtxSetCacheConfig, hipCtxSetCacheConfig, HIP_HIPCTXSETCACHECONFIG, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxSetSharedMemConfig, hipCtxSetSharedMemConfig, HIP_HIPCTXSETSHAREDMEMCONFIG, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipCtxSetSharedMemConfig, hipCtxSetSharedMemConfig, HIP_HIPCTXSETSHAREDMEMCONFIG, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1);
@@ -2539,9 +2539,9 @@ int hip_init (hashcat_ctx_t *hashcat_ctx)
   HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1);
   //HC_LOAD_FUNC_HIP (hip, hipFuncSetAttribute, hipFuncSetAttribute, HIP_HIPFUNCSETATTRIBUTE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipFuncSetCacheConfig, hipFuncSetCacheConfig, HIP_HIPFUNCSETCACHECONFIG, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipFuncSetSharedMemConfig, hipFuncSetSharedMemConfig, HIP_HIPFUNCSETSHAREDMEMCONFIG, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipGetErrorName, HIP_HIPGETERRORNAME, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipFuncSetCacheConfig, hipFuncSetCacheConfig, HIP_HIPFUNCSETCACHECONFIG, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipFuncSetSharedMemConfig, hipFuncSetSharedMemConfig, HIP_HIPFUNCSETSHAREDMEMCONFIG, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipGetErrorName, HIP_HIPGETERRORNAME, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1);
@@ -2556,20 +2556,21 @@ int hip_init (hashcat_ctx_t *hashcat_ctx)
   HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipMemFreeHost, hipFreeHost, HIP_HIPMEMFREEHOST, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipModuleLoad, hipModuleLoad, HIP_HIPMODULELOAD, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipModuleLoadData, hipModuleLoadData, HIP_HIPMODULELOADDATA, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipProfilerStart, hipProfilerStart, HIP_HIPPROFILERSTART, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipProfilerStop, hipProfilerStop, HIP_HIPPROFILERSTOP, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipProfilerStart, hipProfilerStart, HIP_HIPPROFILERSTART, HIP, 1);
+  //HC_LOAD_FUNC_HIP (hip, hipProfilerStop, hipProfilerStop, HIP_HIPPROFILERSTOP, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipStreamWaitEvent, hipStreamWaitEvent, HIP_HIPSTREAMWAITEVENT, HIP, 1);
+
+  //TODO HIP?
 
 #if defined (WITH_CUBINX)
   HC_LOAD_FUNC_HIP (hip, hipLinkCreate, hipLinkCreate, HIP_HIPLINKCREATE, HIP, 1);
   HC_LOAD_FUNC_HIP (hip, hipLinkAddData, hipLinkAddData, HIP_HIPLINKADDDATA, HIP, 1);
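
Each HC_LOAD_FUNC_HIP line resolves one HIP entry point from the dynamically loaded library, so commenting a line out simply stops resolving a symbol hashcat never calls. A generic sketch of what such a loader boils down to (hashcat's actual macro differs in details such as typed function pointers and a portable dlsym wrapper):

#include <dlfcn.h>
#include <stdio.h>

// resolve one symbol from an already dlopen()ed library;
// report an error only when the symbol is mandatory
static void *load_func (void *lib, const char *name, const int mandatory)
{
  void *sym = dlsym (lib, name);

  if (sym == NULL && mandatory == 1)
  {
    fprintf (stderr, "%s is missing from the HIP library\n", name);
  }

  return sym;
}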
@@ -8070,6 +8071,18 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       device_param->kernel_exec_timeout = kernel_exec_timeout;
 
+      // warp size
+
+      int warp_size = 0;
+
+      if (hc_cuDeviceGetAttribute (hashcat_ctx, &warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuda_device) == -1)
+      {
+        device_param->skipped = true;
+        continue;
+      }
+
+      device_param->kernel_preferred_wgs_multiple = warp_size;
+
       // max_shared_memory_per_block
 
       int max_shared_memory_per_block = 0;
@@ -8436,11 +8449,23 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       device_param->kernel_exec_timeout = kernel_exec_timeout;
 
+      // warp size
+
+      int warp_size = 0;
+
+      if (hc_hipDeviceGetAttribute (hashcat_ctx, &warp_size, HIP_DEVICE_ATTRIBUTE_WARP_SIZE, hip_device) == -1)
+      {
+        device_param->skipped = true;
+        continue;
+      }
+
+      device_param->kernel_preferred_wgs_multiple = warp_size;
+
       // max_shared_memory_per_block
 
       int max_shared_memory_per_block = 0;
 
-      if (hc_hipDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, hip_device) == -1)
+      if (hc_hipDeviceGetAttribute (hashcat_ctx, &max_shared_memory_per_block, HIP_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, hip_device) == -1)
       {
         device_param->skipped = true;
         continue;
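
The HIP block above mirrors the CUDA one through hashcat's hc_hipDeviceGetAttribute wrapper and its own HIP_DEVICE_ATTRIBUTE_WARP_SIZE constant. For reference, a minimal sketch of the same query against the public HIP runtime API; hipDeviceAttributeWarpSize typically reports 64 on GCN/CDNA wavefronts and 32 on RDNA parts running in wave32 mode:

#include <stdio.h>
#include <hip/hip_runtime_api.h>

int main (void)
{
  int warp_size = 0;

  // device 0; the wavefront size is a per-device property
  if (hipDeviceGetAttribute (&warp_size, hipDeviceAttributeWarpSize, 0) != hipSuccess) return 1;

  printf ("wavefront size: %d\n", warp_size);

  return 0;
}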
@@ -9053,6 +9078,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       hcfree (device_extensions);
 
+      // kernel_preferred_wgs_multiple
+
+      // There is no global query for this attribute in OpenCL that is not linked to a specific kernel, so we set it to a fixed value
+      // Later in the code, we add vendor specific extensions to query it
+
+      device_param->kernel_preferred_wgs_multiple = 8;
+
       // device_local_mem_type
 
       cl_device_local_mem_type device_local_mem_type;
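
The comment explains the fallback: standard OpenCL exposes the preferred work-group size multiple only per compiled kernel, so nothing can be queried at device-enumeration time. A sketch of the per-kernel query that does exist (it assumes kernel was already built for device, which is exactly what is unavailable at this point in the code):

#include <CL/cl.h>

// standard OpenCL query; it needs a compiled kernel object, which is why
// the enumeration code above falls back to a fixed value of 8 instead
static size_t get_preferred_wgs_multiple (cl_kernel kernel, cl_device_id device)
{
  size_t multiple = 0;

  if (clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof (multiple), &multiple, NULL) != CL_SUCCESS)
  {
    return 0;
  }

  return multiple;
}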
@@ -9332,6 +9364,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
     {
       if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
       {
+        cl_uint device_wavefront_width_amd;
+
+        // from https://www.khronos.org/registry/OpenCL/extensions/amd/cl_amd_device_attribute_query.txt
+        #define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
+
+        if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof (device_wavefront_width_amd), &device_wavefront_width_amd, NULL) == -1)
+        {
+          device_param->skipped = true;
+          continue;
+        }
+
+        device_param->kernel_preferred_wgs_multiple = device_wavefront_width_amd;
+
         cl_device_topology_amd amdtopo;
 
         if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_TOPOLOGY_AMD, sizeof (amdtopo), &amdtopo, NULL) == -1)
@@ -9348,6 +9393,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV))
       {
+        cl_uint device_warp_size_nv;
+
+        // from deps/OpenCL-Headers/CL/cl_ext.h
+        #define CL_DEVICE_WARP_SIZE_NV 0x4003
+
+        if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_WARP_SIZE_NV, sizeof (device_warp_size_nv), &device_warp_size_nv, NULL) == -1)
+        {
+          device_param->skipped = true;
+          continue;
+        }
+
+        device_param->kernel_preferred_wgs_multiple = device_warp_size_nv;
+
         cl_uint pci_bus_id_nv; // is cl_uint the right type for them??
         cl_uint pci_slot_id_nv;
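
Both vendor branches follow the same clGetDeviceInfo pattern, just with different extension enums. A condensed sketch of the probe-with-fallback flow (enum values copied from the headers cited above; the fixed 8 is the default set earlier in the enumeration code):

#include <CL/cl.h>

#define CL_DEVICE_WARP_SIZE_NV        0x4003 // cl_nv_device_attribute_query
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 // cl_amd_device_attribute_query

static cl_uint query_wgs_multiple (cl_device_id device)
{
  cl_uint v = 0;

  if (clGetDeviceInfo (device, CL_DEVICE_WARP_SIZE_NV,        sizeof (v), &v, NULL) == CL_SUCCESS) return v;
  if (clGetDeviceInfo (device, CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof (v), &v, NULL) == CL_SUCCESS) return v;

  return 8; // no vendor extension available: keep the fixed default
}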
@@ -10324,9 +10382,7 @@ static u32 get_kernel_threads (const hc_device_param_t *device_param)
   {
     // for all CPU we just do 1 ...
 
-    const u32 cpu_prefered_thread_count = 1;
-
-    kernel_threads_max = MIN (kernel_threads_max, cpu_prefered_thread_count);
+    kernel_threads_max = MIN (kernel_threads_max, 1);
   }
   else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
   {
@@ -10334,28 +10390,15 @@ static u32 get_kernel_threads (const hc_device_param_t *device_param)
    if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK)
    {
-      const u32 gpu_prefered_thread_count = 8;
-
-      kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count);
+      kernel_threads_max = MIN (kernel_threads_max, 8);
    }
    else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
    {
-      u32 gpu_prefered_thread_count = 64;
-
-      if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE)
-      {
-        // based on clinfo output: Preferred work group size multiple (kernel)
-        gpu_prefered_thread_count = 32;
-      }
-
-      kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count);
+      kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
    }
    else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)
    {
-      u32 gpu_prefered_thread_count = 64;
-
-      kernel_threads_max = MIN (kernel_threads_max, gpu_prefered_thread_count);
+      kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
    }
  }
@@ -10371,6 +10414,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
 {
   const hashconfig_t    *hashconfig    = hashcat_ctx->hashconfig;
   const folder_config_t *folder_config = hashcat_ctx->folder_config;
+  const user_options_t  *user_options  = hashcat_ctx->user_options;
 
   bool cached = true;
@@ -10658,9 +10702,9 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
     //hiprtc_options[1] = "--device-as-default-execution-space";
     //hiprtc_options[2] = "--gpu-architecture";
-    //hc_asprintf (&hiprtc_options[3], "compute_%d%d", device_param->sm_major, device_param->sm_minor);
-    hiprtc_options[0] = "--gpu-max-threads-per-block=64";
+    hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple);
+    //hiprtc_options[0] = "--gpu-max-threads-per-block=64";
 
     hiprtc_options[1] = "-nocudainc";
     hiprtc_options[2] = "-nocudalib";
     hiprtc_options[3] = "";
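
The hiprtc option is now built at runtime: --gpu-max-threads-per-block gets the user's --kernel-threads value when one was given (kernel_threads_chgd), otherwise the queried wavefront size, instead of a pinned 64. A minimal sketch of the string construction, with plain asprintf standing in for hashcat's hc_asprintf and placeholder values:

#define _GNU_SOURCE // for asprintf
#include <stdio.h>
#include <stdlib.h>

int main (void)
{
  const int kernel_threads_chgd = 0;  // 1 if the user passed --kernel-threads
  const int kernel_threads      = 32; // the user-supplied value, if any
  const int wgs_multiple        = 64; // queried warp/wavefront size

  char *opt = NULL;

  if (asprintf (&opt, "--gpu-max-threads-per-block=%d", (kernel_threads_chgd == 1) ? kernel_threads : wgs_multiple) == -1) return 1;

  printf ("%s\n", opt); // would be handed to hiprtcCompileProgram () as one option

  free (opt);

  return 0;
}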
@@ -11408,24 +11452,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      }
      else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
      {
-        // for GPU we need to distinguish by vendor
-
-        if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK)
-        {
-          native_threads = 8;
-        }
-        else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
-        {
-          native_threads = 64;
-        }
-        else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)
-        {
-          native_threads = 64;
-        }
-        else
-        {
-          native_threads = 32;
-        }
+        native_threads = device_param->kernel_preferred_wgs_multiple;
      }
      else
      {
@@ -11751,7 +11778,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    char device_name_chksum_amp_mp[HCBUFSIZ_TINY] = { 0 };
 
-    const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s",
+    const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%u",
      backend_ctx->comptime,
      backend_ctx->cuda_driver_version,
      backend_ctx->hip_driver_version,
@@ -11759,7 +11786,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      device_param->opencl_platform_vendor_id,
      device_param->device_name,
      device_param->opencl_device_version,
-      device_param->opencl_driver_version);
+      device_param->opencl_driver_version,
+      (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple);
 
    md5_ctx_t md5_ctx;
@@ -12078,7 +12106,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
    const u32 extra_value = (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) ? ATTACK_MODE_ASSOCIATION : ATTACK_MODE_NONE;
 
-    const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d-%u-%u-%s",
+    const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d-%u-%u-%u-%s",
      backend_ctx->comptime,
      backend_ctx->cuda_driver_version,
      backend_ctx->hip_driver_version,
@@ -12090,6 +12118,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      device_param->vector_width,
      hashconfig->kern_type,
      extra_value,
+      (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple,
      build_options_module_buf);
 
    md5_ctx_t md5_ctx;
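
Since the compiled kernel now depends on the thread count, the count must feed into the cache checksum; otherwise a binary cached with one --kernel-threads value would be silently reused with another. A reduced sketch of the mechanism with placeholder values (the real key has many more fields, as the diff shows; hashcat then MD5-hashes this string to form the kernel cache filename):

#include <stdio.h>

int main (void)
{
  char key_a[64];
  char key_b[64];

  // same build and device, different kernel thread counts
  snprintf (key_a, sizeof (key_a), "%d-%s-%u", 123456, "ExampleGPU", 64u);
  snprintf (key_b, sizeof (key_b), "%d-%s-%u", 123456, "ExampleGPU", 32u);

  // distinct keys -> distinct cache filenames -> no stale kernel reuse
  printf ("%s\n%s\n", key_a, key_b);

  return 0;
}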
@@ -14810,14 +14839,6 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      }
    }
 
-    // we
-
-    if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)
-    {
-      device_param->kernel_threads_min = MIN (device_param->kernel_threads_min, 64);
-      device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64);
-    }
-
    /**
     * now everything that depends on threads and accel, basically dynamic workload
     */