From d38d40c8ba403ad0c37e43bbfe33da22cb21dcb5 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Thu, 29 Jul 2021 10:49:44 +0200 Subject: [PATCH] Unlock all GPU threads for AMD GPUs if WaveFront size is 32 (basically new models) Add new hash-modes to tools/benchmark_deep.pl Fix MINGW issue on 64 bit constant in refactored kernel-accel limiting section --- src/backend.c | 29 ++++++++++++++++++++++------- tools/benchmark_deep.pl | 17 +++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/backend.c b/src/backend.c index 63e0beb3b..057425250 100644 --- a/src/backend.c +++ b/src/backend.c @@ -10411,11 +10411,21 @@ static u32 get_kernel_threads (const hc_device_param_t *device_param) } else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD) { - kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); + if (device_param->kernel_preferred_wgs_multiple == 64) + { + // only older AMD GPUs with WaveFront size 64 benefit from this + + kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); + } } else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP) { - kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); + if (device_param->kernel_preferred_wgs_multiple == 64) + { + // only older AMD GPUs with WaveFront size 64 benefit from this + + kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); + } } } @@ -10719,7 +10729,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p //hiprtc_options[1] = "--device-as-default-execution-space"; //hiprtc_options[2] = "--gpu-architecture"; - hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? 
user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple); + hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX)); //hiprtc_options[0] = "--gpu-max-threads-per-block=64"; hiprtc_options[1] = "-nocudainc"; @@ -11804,7 +11814,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->device_name, device_param->opencl_device_version, device_param->opencl_driver_version, - (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple); + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX)); md5_ctx_t md5_ctx; @@ -12139,7 +12149,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->vector_width, hashconfig->kern_type, extra_value, - (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple, + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX), build_options_module_buf); md5_ctx_t md5_ctx; @@ -14883,6 +14893,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) u32 kernel_accel_max = device_param->kernel_accel_max; // We need to deal with the situation that the total video RAM > total host RAM. + // For the opposite direction, we do that in the loop section below. // Especially in multi-GPU setups that is very likely. // The buffers which actually take a lot of memory (except for SCRYPT) are the ones for the password candidates. // They are stored in an aligned order for better performance, but this increases the memory pressure. 
@@ -14893,7 +14904,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // We need to hard-code some value, let's assume that (in 2021) the host has at least 8GB ram per active GPU - const u64 SIZE_8GB = 8UL * 1024 * 1024 * 1024; + const u64 SIZE_8GB = 8ULL * 1024 * 1024 * 1024; u64 accel_limit = SIZE_8GB; @@ -14909,6 +14920,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) accel_limit /= 3; + // It is possible that the GPU simply has too many hardware resources and 8GB per GPU is not enough, but OTOH we can't get lower than 1 + + accel_limit = MAX (accel_limit, 1); + // I think vector size is not required because vector_size is dividing the pws_cnt in run_kernel() kernel_accel_max = MIN (kernel_accel_max, accel_limit); @@ -14921,7 +14936,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) return -1; } - // find out if we would request too much memory on memory blocks which are based on kernel_accel + // Opposite direction check: find out if we would request too much memory on memory blocks which are based on kernel_accel u64 size_pws = 4; u64 size_pws_amp = 4; diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl index df6777441..ba2db8856 100755 --- a/tools/benchmark_deep.pl +++ b/tools/benchmark_deep.pl @@ -230,16 +230,19 @@ my @hash_types = 13751, 13761, 13771, + 13781, 13800, 13900, 14000, 14100, 14400, + 14500, 14700, 14800, 14900, 15000, 15100, + 15200, 15300, 15400, 15500, @@ -250,10 +253,13 @@ my @hash_types = 16200, 16300, 16400, + 16500, 16600, + 16700, 16800, 16801, 16900, + 17210, 17300, 17400, 17500, @@ -333,12 +339,23 @@ my @hash_types = 24700, 24800, 24900, + 25000, + 25100, + 25200, 25300, 25400, 25500, + 25700, 25900, 26000, 26100, + 26200, + 26300, + 26401, + 26402, + 26403, + 26500, + 26600, ); if (scalar @ARGV)