From d38d40c8ba403ad0c37e43bbfe33da22cb21dcb5 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Thu, 29 Jul 2021 10:49:44 +0200
Subject: [PATCH] Unlock all GPU threads for AMD GPUs if WaveFront size is 32
 (basically new models); add new hash-modes to tools/benchmark_deep.pl; fix
 MINGW issue on 64-bit constant in refactored kernel-accel limiting section

---
 src/backend.c           | 29 ++++++++++++++++++++++-------
 tools/benchmark_deep.pl | 17 +++++++++++++++++
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/backend.c b/src/backend.c
index 63e0beb3b..057425250 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -10411,11 +10411,21 @@ static u32 get_kernel_threads (const hc_device_param_t *device_param)
     }
     else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
     {
-      kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
+      if (device_param->kernel_preferred_wgs_multiple == 64)
+      {
+        // only older AMD GPUs with WaveFront size 64 benefit from this
+
+        kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
+      }
     }
     else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)
     {
-      kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
+      if (device_param->kernel_preferred_wgs_multiple == 64)
+      {
+        // only older AMD GPUs with WaveFront size 64 benefit from this
+
+        kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple);
+      }
     }
   }
 
@@ -10719,7 +10729,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
       //hiprtc_options[1] = "--device-as-default-execution-space";
       //hiprtc_options[2] = "--gpu-architecture";
 
-      hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple);
+      hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX));
 
       //hiprtc_options[0] = "--gpu-max-threads-per-block=64";
       hiprtc_options[1] = "-nocudainc";
@@ -11804,7 +11814,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       device_param->device_name,
       device_param->opencl_device_version,
       device_param->opencl_driver_version,
-      (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple);
+      (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX));
 
     md5_ctx_t md5_ctx;
 
@@ -12139,7 +12149,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         device_param->vector_width,
         hashconfig->kern_type,
         extra_value,
-        (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_preferred_wgs_multiple,
+        (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX),
         build_options_module_buf);
 
       md5_ctx_t md5_ctx;
@@ -14883,6 +14893,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     u32 kernel_accel_max = device_param->kernel_accel_max;
 
     // We need to deal with the situation that the total video RAM > total host RAM.
+    // (The opposite direction is handled in the loop section below.)
     // Especially in multi-GPU setups that is very likely.
     // The buffers which actually take a lot of memory (except for SCRYPT) are the ones for the password candidates.
     // They are stored in an aligned order for better performance, but this increases the memory pressure.
@@ -14893,7 +14904,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     // We need to hard-code some value, let's assume that (in 2021) the host has at least 8GB ram per active GPU
 
-    const u64 SIZE_8GB = 8UL * 1024 * 1024 * 1024;
+    const u64 SIZE_8GB = 8ULL * 1024 * 1024 * 1024;
 
     u64 accel_limit = SIZE_8GB;
 
@@ -14909,6 +14920,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     accel_limit /= 3;
 
+    // It is possible that the GPU simply has too many hardware resources and 8GB per GPU is not enough, but OTOH we can't get lower than 1
+
+    accel_limit = MAX (accel_limit, 1);
+
     // I think vector size is not required because vector_size is dividing the pws_cnt in run_kernel()
 
     kernel_accel_max = MIN (kernel_accel_max, accel_limit);
@@ -14921,7 +14936,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       return -1;
     }
 
-    // find out if we would request too much memory on memory blocks which are based on kernel_accel
+    // Opposite direction check: find out if we would request too much memory on memory blocks which are based on kernel_accel
 
     u64 size_pws      = 4;
     u64 size_pws_amp  = 4;
diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl
index df6777441..ba2db8856 100755
--- a/tools/benchmark_deep.pl
+++ b/tools/benchmark_deep.pl
@@ -230,16 +230,19 @@ my @hash_types =
   13751,
   13761,
   13771,
+  13781,
   13800,
   13900,
   14000,
   14100,
   14400,
+  14500,
   14700,
   14800,
   14900,
   15000,
   15100,
+  15200,
   15300,
   15400,
   15500,
@@ -250,10 +253,13 @@ my @hash_types =
   16200,
   16300,
   16400,
+  16500,
   16600,
+  16700,
   16800,
   16801,
   16900,
+  17210,
   17300,
   17400,
   17500,
@@ -333,12 +339,23 @@ my @hash_types =
   24700,
   24800,
   24900,
+  25000,
+  25100,
+  25200,
   25300,
   25400,
   25500,
+  25700,
   25900,
   26000,
   26100,
+  26200,
+  26300,
+  26401,
+  26402,
+  26403,
+  26500,
+  26600,
 );
 
 if (scalar @ARGV)