From 25f1c12e3c7ff4efc71b9dd10aa2de7f81dfc122 Mon Sep 17 00:00:00 2001
From: Jens Steube <jens.steube@gmail.com>
Date: Wed, 28 Jul 2021 07:51:27 +0200
Subject: [PATCH] SCRYPT Kernels: Add more optimized values for some new NV/AMD
 GPUs and new semi-automated derivation process description Blowfish Kernels:
 Backport optimizations reducing bank conflicts from bcrypt to Password Safe
 v2 and Open Document Format (ODF) 1.1

---
 docs/changes.txt           |  4 ++
 hashcat.hctune             | 88 ++++++++++++++++++++++++++------------
 src/modules/module_09000.c | 63 +++++++++++++++++++++------
 3 files changed, 115 insertions(+), 40 deletions(-)

diff --git a/docs/changes.txt b/docs/changes.txt
index 0c31c4cdf..8cad8aa31 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -19,6 +19,7 @@
 ##
 
 - AMD GPUs: Add inline assembly code for md5crypt/sha256crypt, PDF 1.7, 7-Zip, RAR3, Samsung Android and Windows Phone 8+
+- AMD GPUs: On Apple OpenCL platform, we ask for the preferred kernel thread size rather than hard-coding 32
 - Blake Kernels: Optimize BLAKE2B_ROUND() 64 bit rotates giving a 5% performance increase
 - Blowfish Kernels: Backport optimizations reducing bank conflicts from bcrypt to Password Safe v2 and Open Document Format (ODF) 1.1
 - Brain Session: Adds hashconfig specific opti_type and opts_type parameters to hashcat session computation to cover features like -O and -M
@@ -31,7 +32,10 @@
 ##
 
 - ADL: Updated support for AMD Display Library to 15.0, updated datatypes and added support for OverDrive 7 and 8 based GPUs
+- AMD Driver: Updated requirement for AMD Linux driver to ROCm 4.4 or later because of new HIP Interface
+- AMD Driver: Updated requirement for AMD Windows driver to Adrenalin 21.2.1 or later because of new ADL library
 - Commandline: Throw an error if separator character given by the user with -p option is not exactly 1 byte
+- ECC secp256k1: Removed the inline assembly code for AMD GPUs because the latest JIT compilers optimize it with the same efficiency
 - HIP Kernels: Got rid of hip/hip_runtime.h dependancy to enable more easy integration of the HIP backend on Windows
 - Kernel Cache: Add kernel threads into hash computation which is later used in the kernel cache filename
 - SCRYPT Kernels: Add more optimized values for some new NV/AMD GPUs
diff --git a/hashcat.hctune b/hashcat.hctune
index 2e1951eef..2b99ee149 100644
--- a/hashcat.hctune
+++ b/hashcat.hctune
@@ -279,7 +279,14 @@ GeForce_RTX_3090                                ALIAS_nv_sm50_or_higher
 ##
 
 Device_738c                                     ALIAS_AMD_MI100
+
+AMD_Radeon_(TM)_RX_480_Graphics                 ALIAS_AMD_RX480
+
+Vega_10_XL/XT_[Radeon_RX_Vega_56/64]            ALIAS_AMD_Vega64
+AMD_Radeon_Vega_64                              ALIAS_AMD_Vega64
+
 Device_73bf                                     ALIAS_AMD_RX6900XT
+AMD_Radeon_RX_6900_XT                           ALIAS_AMD_RX6900XT
 
 #############
 ## ENTRIES ##
@@ -486,22 +493,41 @@ DEVICE_TYPE_GPU                                 *       14500   1       A
 ##
 ## Find the ideal -n value, then store it here along with the proper compute device name.
 ## Formatting guidelines are availabe at the top of this document.
+##
+## -------------------------------------------------
+##
+## You can also ignore all theoretical derivations and semi-automate the process in the real scenario (I prefer this approach):
+##
+## 1. For example, to find the value for 8900, first create a valid hash for 8900 as follows:
+##
+## $ ./hashcat --example-hashes -m 8900 | grep Example.Hash | grep -v Format | cut -b 25- > tmp.hash.8900
+##
+## 2. Now let it iterate through all -n values up to a certain point. In this case, I'm using 200, but in general it should be a value that is at least twice the number of multiprocessors of the device. If you don't mind, you can just leave it as it is; it just runs a little longer.
+##
+## $ export i=1; while [ $i -ne 201 ]; do echo $i; ./hashcat --quiet tmp.hash.8900 --keep-guessing --self-test-disable --markov-disable --restore-disable --outfile-autohex-disable --wordlist-autohex-disable --potfile-disable --logfile-disable --hwmon-disable --status --status-timer 1 --runtime 28 --machine-readable --optimized-kernel-enable --workload-profile 3 --hash-type 8900 --attack-mode 3 ?b?b?b?b?b?b?b --backend-devices 1 --force -n $i; i=$(($i+1)); done | tee x
+##
+## 3. Determine the highest measured H/s speed. But don't just use the highest value. Instead, use the number that seems most stable, usually at the beginning.
+##
+## $ grep "$(printf 'STATUS\t3')" x | cut -f4 -d$'\t' | sort -n | tail
+##
+## 4. To match the speed you have chosen to the correct value in the "x" file, simply search for that speed in the file. Then go up a little to the start of the block in which you found it. The -n value is the single number printed just before the block starts. If you have multiple blocks at the same speed, choose the lowest value for -n
+##
 
 ## 4GB
-GeForce_GTX_980                                 *       8900    1      28       A
+GeForce_GTX_980                                 *       8900    1      29       A
 GeForce_GTX_980                                 *       9300    1     128       A
-GeForce_GTX_980                                 *       15700   1      28       A
-GeForce_GTX_980                                 *       22700   1      28       A
+GeForce_GTX_980                                 *       15700   1      24       A
+GeForce_GTX_980                                 *       22700   1      29       A
 
 ## 8GB
-GeForce_GTX_1080                                *       8900    1      14       A
+GeForce_GTX_1080                                *       8900    1      15       A
 GeForce_GTX_1080                                *       9300    1     256       A
-GeForce_GTX_1080                                *       15700   1      14       A
-GeForce_GTX_1080                                *       22700   1      14       A
+GeForce_GTX_1080                                *       15700   1      28       A
+GeForce_GTX_1080                                *       22700   1      15       A
 
 ## 11GB
 GeForce_RTX_2080_Ti                             *       8900    1      68       A
-GeForce_RTX_2080_Ti                             *       9300    1     532       A
+GeForce_RTX_2080_Ti                             *       9300    1     528       A
 GeForce_RTX_2080_Ti                             *       15700   1      68       A
 GeForce_RTX_2080_Ti                             *       22700   1      68       A
 
@@ -509,7 +535,7 @@ GeForce_RTX_2080_Ti                             *       22700   1      68
 GeForce_RTX_3060_Ti                             *       8900    1      51       A
 GeForce_RTX_3060_Ti                             *       9300    1     256       A
 GeForce_RTX_3060_Ti                             *       15700   1      11       A
-GeForce_RTX_3060_Ti                             *       22700   1      43       A
+GeForce_RTX_3060_Ti                             *       22700   1      51       A
 
 ## 8GB
 GeForce_RTX_3070                                *       8900    1      46       A
@@ -517,26 +543,32 @@ GeForce_RTX_3070                                *       9300    1     368
 GeForce_RTX_3070                                *       15700   1      22       A
 GeForce_RTX_3070                                *       22700   1      46       A
 
+## 24GB
+GeForce_RTX_3090                                *       8900    1      82       A
+GeForce_RTX_3090                                *       9300    1     984       A
+GeForce_RTX_3090                                *       15700   1      82       A
+GeForce_RTX_3090                                *       22700   1      82       A
+
 ## 4GB
-AMD_Radeon_(TM)_RX_480_Graphics                 *       8900    1      14       A
-AMD_Radeon_(TM)_RX_480_Graphics                 *       9300    1     126       A
-AMD_Radeon_(TM)_RX_480_Graphics                 *       15700   1      14       A
-AMD_Radeon_(TM)_RX_480_Graphics                 *       22700   1      14       A
+ALIAS_AMD_RX480                                 *       8900    1      15       A
+ALIAS_AMD_RX480                                 *       9300    1     232       A
+ALIAS_AMD_RX480                                 *       15700   1      58       A
+ALIAS_AMD_RX480                                 *       22700   1      15       A
 
 ## 8GB
-Vega_10_XL/XT_[Radeon_RX_Vega_56/64]            *       8900    1      28       A
-Vega_10_XL/XT_[Radeon_RX_Vega_56/64]            *       9300    1     442       A
-Vega_10_XL/XT_[Radeon_RX_Vega_56/64]            *       15700   1      28       A
-Vega_10_XL/XT_[Radeon_RX_Vega_56/64]            *       22700   1      28       A
-
-## 32GB, WF64
-ALIAS_AMD_MI100                                 *       8900    1      76       A
-ALIAS_AMD_MI100                                 *       9300    1     288       A
-ALIAS_AMD_MI100                                 *       15700   1      76       A
-ALIAS_AMD_MI100                                 *       22700   1      76       A
-
-## 16GB, WF32
-ALIAS_AMD_RX6900XT                              *       8900    1      62       A
-ALIAS_AMD_RX6900XT                              *       9300    1     704       A
-ALIAS_AMD_RX6900XT                              *       15700   1      62       A
-ALIAS_AMD_RX6900XT                              *       22700   1      62       A
+ALIAS_AMD_Vega64                                *       8900    1      31       A
+ALIAS_AMD_Vega64                                *       9300    1     440       A
+ALIAS_AMD_Vega64                                *       15700   1      53       A
+ALIAS_AMD_Vega64                                *       22700   1      31       A
+
+## 32GB
+ALIAS_AMD_MI100                                 *       8900    1      79       A
+ALIAS_AMD_MI100                                 *       9300    1    1000       A
+ALIAS_AMD_MI100                                 *       15700   1     120       A
+ALIAS_AMD_MI100                                 *       22700   1      79       A
+
+## 16GB
+ALIAS_AMD_RX6900XT                              *       8900    1      59       A
+ALIAS_AMD_RX6900XT                              *       9300    1     720       A
+ALIAS_AMD_RX6900XT                              *       15700   1      56       A
+ALIAS_AMD_RX6900XT                              *       22700   1      59       A
diff --git a/src/modules/module_09000.c b/src/modules/module_09000.c
index 3e3158c0a..28963a1fc 100644
--- a/src/modules/module_09000.c
+++ b/src/modules/module_09000.c
@@ -22,7 +22,8 @@ static const u64   KERN_TYPE      = 9000;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
 static const u64   OPTS_TYPE      = OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_BINARY_HASHFILE
-                                  | OPTS_TYPE_AUTODETECT_DISABLE;
+                                  | OPTS_TYPE_AUTODETECT_DISABLE
+                                  | OPTS_TYPE_DYNAMIC_SHARED;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
 static const char *ST_PASS        = "hashcat";
 static const char *ST_HASH        = "0a3f352686e5eb5be173e668a4fff5cd5df420927e1da2d5d4052340160637e3e6a5a92841a188ed240e13b919f3d91694bd4c0acba79271e9c08a83ea5ad387cbb74d5884066a1cb5a8caa80d847079168f84823847c631dbe3a834f1bc496acfebac3bff1608bf1c857717f8f428e07b5e2cb12aaeddfa83d7dcb6d840234d08b84f8ca6c6e562af73eea13148f7902bcaf0220d3e36eeeff1d37283dc421483a2791182614ebb";
@@ -75,16 +76,25 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 {
   char *jit_build_options = NULL;
 
+  // this mode heavily depends on the available shared memory size
+  // note the kernel needs some special code changes in order to make use of the post-48k memory region
+  // we need to set some macros
+
+  bool use_dynamic = false;
+
+  if (device_param->is_cuda == true)
+  {
+    use_dynamic = true;
+  }
+
   // this uses some nice feedback effect.
   // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value
   // which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result.
   // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1.
 
-  u32 fixed_local_size = 0;
-
   if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
   {
-    fixed_local_size = 1;
+    hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", 1);
   }
   else
   {
@@ -100,29 +110,58 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY
 
       if (device_param->is_opencl == true)
       {
-        overhead = 4;
+        overhead = 1;
       }
     }
 
     if (user_options->kernel_threads_chgd == true)
     {
-      fixed_local_size = user_options->kernel_threads;
+      u32 fixed_local_size = user_options->kernel_threads;
+
+      if (use_dynamic == true)
+      {
+        if ((fixed_local_size * 4096) > device_param->kernel_dynamic_local_mem_size_memset)
+        {
+          // otherwise out-of-bound reads
 
-      // otherwise out-of-bound reads
+          fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096;
+        }
 
-      if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
+        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size);
+      }
+      else
       {
-        fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+        if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead))
+        {
+          // otherwise out-of-bound reads
+
+          fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+        }
+
+        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
       }
     }
     else
     {
-      fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+      if (use_dynamic == true)
+      {
+        // using kernel_dynamic_local_mem_size_memset is a bit hackish.
+        // we had to brute-force this value out of an already loaded CUDA function.
+        // there's no official way to query for this value.
+
+        const u32 fixed_local_size = device_param->kernel_dynamic_local_mem_size_memset / 4096;
+
+        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D DYNAMIC_LOCAL", fixed_local_size);
+      }
+      else
+      {
+        const u32 fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096;
+
+        hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
+      }
     }
   }
 
-  hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
-
   return jit_build_options;
 }