From 55f53ba07624ca44e9bc7c9943424e1f36dc21f9 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 11 May 2025 17:16:03 +0200 Subject: [PATCH 01/37] assigned -H to --hash-info --- docs/changes.txt | 2 + extra/tab_completion/hashcat.sh | 6 +- include/types.h | 6 +- src/backend.c | 2 +- src/bitmap.c | 2 +- src/combinator.c | 2 +- src/cpt.c | 2 +- src/debugfile.c | 2 +- src/dictstat.c | 2 +- src/hwmon.c | 2 +- src/induct.c | 2 +- src/interface.c | 2 +- src/loopback.c | 2 +- src/main.c | 12 +- src/terminal.c | 224 ++++++++++++++++++++++++++------ src/usage.c | 2 +- src/user_options.c | 29 +++-- 17 files changed, 230 insertions(+), 71 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index 50b23505f..f0f74d925 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -130,6 +130,8 @@ - Metal Backend: allow use of devices with Metal if runtime version is >= 200 - Metal Backend: disable Metal devices only if at least one OpenCL device is active - User Options: added --metal-compiler-runtime option +- User Options: assigned -H to --hash-info +- Hash-Info: show more details using -HH - Hardware Monitor: avoid sprintf in src/ext_iokit.c - Help: show supported hash-modes only with -hh - Makefile: prevent make failure with Apple Silicon in case of partial rebuild diff --git a/extra/tab_completion/hashcat.sh b/extra/tab_completion/hashcat.sh index f823cda72..953775345 100755 --- a/extra/tab_completion/hashcat.sh +++ b/extra/tab_completion/hashcat.sh @@ -425,7 +425,7 @@ _hashcat () local HIDDEN_FILES_AGGRESSIVE="${HIDDEN_FILES}|hcmask|hcchr" local BUILD_IN_CHARSETS='?l ?u ?d ?a ?b ?s ?h ?H' - local SHORT_OPTS="-m -a -V -h -b -t -T -o -p -c -d -D -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z -M -Y" + local SHORT_OPTS="-m -a -V -h -H -b -t -T -o -p -c -d -D -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z -M -Y" local LONG_OPTS="--hash-type --attack-mode --version --help --quiet --benchmark --benchmark-all --hex-salt --hex-wordlist --hex-charset --force --status --status-json --status-timer --stdin-timeout-abort --machine-readable --loopback --markov-hcstat2 --markov-disable --markov-inverse --markov-classic --markov-threshold --runtime --session --speed-only --progress-only --restore --restore-file-path --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-json --outfile-check-timer --outfile-check-dir --wordlist-autohex-disable --separator --show --deprecated-check-disable --left --username --dynamic-x --remove --remove-timer --potfile-disable --potfile-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --example-hashes --hash-info --backend-ignore-cuda --backend-ignore-opencl --backend-ignore-hip --backend-ignore-metal --backend-info --backend-devices --backend-devices-virtual --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-disable --hwmon-temp-abort --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-func-sel --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --stdout --keep-guessing --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to 
--optimized-kernel-enable --multiply-accel-disable --self-test-disable --slow-candidates --brain-server --brain-server-timer --brain-client --brain-client-features --brain-host --brain-port --brain-session --brain-session-whitelist --brain-password --identify" local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --status-timer --stdin-timeout-abort --markov-hcstat2 --markov-threshold --runtime --session --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --potfile-path --restore-file-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --backend-devices --backend-devices-virtual --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-temp-abort --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-func-sel --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment-min --increment-max --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --brain-server-timer --brain-client-features --brain-host --brain-password --brain-port --brain-session --brain-session-whitelist" @@ -729,11 +729,11 @@ _hashcat () while [ ${h} -le ${COMP_CWORD} ]; do - if [[ "${COMP_WORDS[h]}" == "-a" ]]; then + if [[ "${COMP_WORDS[h]}" == "-a" ]]; then attack_mode=${COMP_WORDS[$((h + 1))]} - elif [[ "${COMP_WORDS[h]}" == -a* ]]; then + elif [[ "${COMP_WORDS[h]}" == -a* ]]; then attack_mode=${COMP_WORDS[h]:2} diff --git a/include/types.h b/include/types.h index e6ea946f9..834bb999e 100644 --- a/include/types.h +++ b/include/types.h @@ -646,7 +646,7 @@ typedef enum user_options_defaults #else HWMON_TEMP_ABORT = 90, #endif - HASH_INFO = false, + HASH_INFO = 0, HASH_MODE = 0, HCCAPX_MESSAGE_PAIR = 0, HEX_CHARSET = false, @@ -761,7 +761,7 @@ typedef enum user_options_map IDX_DYNAMIC_X = 0xff55, IDX_ENCODING_FROM = 0xff15, IDX_ENCODING_TO = 0xff16, - IDX_HASH_INFO = 0xff17, + IDX_HASH_INFO = 'H', // 0xff17 IDX_FORCE = 0xff18, IDX_HWMON_DISABLE = 0xff19, IDX_HWMON_TEMP_ABORT = 0xff1a, @@ -2344,7 +2344,6 @@ typedef struct user_options bool deprecated_check; bool dynamic_x; bool hwmon; - bool hash_info; bool hex_charset; bool hex_salt; bool hex_wordlist; @@ -2429,6 +2428,7 @@ typedef struct user_options #endif u32 debug_mode; u32 hwmon_temp_abort; + u32 hash_info; int hash_mode; u32 hccapx_message_pair; u32 hook_threads; diff --git a/src/backend.c b/src/backend.c index 2892cd6b4..430fff33d 100644 --- a/src/backend.c +++ b/src/backend.c @@ -4258,8 +4258,8 @@ int backend_ctx_init (hashcat_ctx_t *hashcat_ctx) backend_ctx->enabled = false; if (user_options->usage > 0) return 0; + if (user_options->hash_info > 0) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/bitmap.c b/src/bitmap.c index 367b16a7c..1652b7cb9 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -81,8 +81,8 @@ int bitmap_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; - if (user_options->hash_info 
== true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/combinator.c b/src/combinator.c index 654ada03f..47715ff5e 100644 --- a/src/combinator.c +++ b/src/combinator.c @@ -21,8 +21,8 @@ int combinator_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; - if (user_options->hash_info == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; if (user_options->version == true) return 0; diff --git a/src/cpt.c b/src/cpt.c index 8dc393c3a..97627b8c0 100644 --- a/src/cpt.c +++ b/src/cpt.c @@ -17,8 +17,8 @@ int cpt_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/debugfile.c b/src/debugfile.c index 20ee3ad1f..25b0a6033 100644 --- a/src/debugfile.c +++ b/src/debugfile.c @@ -118,9 +118,9 @@ int debugfile_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/dictstat.c b/src/dictstat.c index a607cf7d6..416eb90b3 100644 --- a/src/dictstat.c +++ b/src/dictstat.c @@ -58,9 +58,9 @@ int dictstat_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/hwmon.c b/src/hwmon.c index d18d12552..dffc59cd5 100644 --- a/src/hwmon.c +++ b/src/hwmon.c @@ -1228,8 +1228,8 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/induct.c b/src/induct.c index 3d9755034..476abe958 100644 --- a/src/induct.c +++ b/src/induct.c @@ -41,9 +41,9 @@ int induct_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/interface.c b/src/interface.c index 691e2931b..d52c0b1dc 100644 --- a/src/interface.c +++ b/src/interface.c @@ -355,7 +355,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx) hashconfig->has_optimized_kernel = hc_path_read (source_file); - if (user_options->hash_info == false) + if 
(user_options->hash_info == 0 || user_options->hash_info > 1) { if (user_options->optimized_kernel == true) { diff --git a/src/loopback.c b/src/loopback.c index 28dcff0d0..b5799a435 100644 --- a/src/loopback.c +++ b/src/loopback.c @@ -62,9 +62,9 @@ int loopback_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/main.c b/src/main.c index 085ce38b4..f7a820cce 100644 --- a/src/main.c +++ b/src/main.c @@ -192,12 +192,13 @@ static void main_outerloop_starting (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MA status_ctx->shutdown_outer = false; - if (user_options->hash_info == true) return; + if (user_options->backend_info > 0) return; + if (user_options->hash_info > 0) return; + if (user_options->keyspace == true) return; if (user_options->stdout_flag == true) return; if (user_options->speed_only == true) return; if (user_options->identify == true) return; - if (user_options->backend_info > 0) return; if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == WL_MODE_MASK)) { @@ -269,10 +270,11 @@ static void main_cracker_finished (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYB const user_options_t *user_options = hashcat_ctx->user_options; const user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; - if (user_options->hash_info == true) return; + if (user_options->backend_info > 0) return; + if (user_options->hash_info > 0) return; + if (user_options->keyspace == true) return; if (user_options->stdout_flag == true) return; - if (user_options->backend_info > 0) return; // if we had a prompt, clear it @@ -1328,7 +1330,7 @@ int main (int argc, char **argv) rc_final = 0; } - else if (user_options->hash_info == true) + else if (user_options->hash_info > 0) { hash_info (hashcat_ctx); diff --git a/src/terminal.c b/src/terminal.c index 211a8b24f..9bcbdb4cf 100644 --- a/src/terminal.c +++ b/src/terminal.c @@ -87,7 +87,7 @@ void welcome_screen (hashcat_ctx_t *hashcat_ctx, const char *version_tag) event_log_info (hashcat_ctx, "%s (%s) starting in autodetect mode", PROGNAME, version_tag); event_log_info (hashcat_ctx, NULL); } - else if (user_options->hash_info == true) + else if (user_options->hash_info > 0) { event_log_info (hashcat_ctx, "%s (%s) starting in hash-info mode", PROGNAME, version_tag); event_log_info (hashcat_ctx, NULL); @@ -634,7 +634,7 @@ void compress_terminal_line_length (char *out_buf, const size_t keep_from_beginn *ptr1 = 0; } -void json_encode (char *text, char *escaped) +void json_encode (const char *text, char *escaped) { /* * Based on https://www.freeformatter.com/json-escape.html, below these 7 different chars @@ -667,6 +667,8 @@ void json_encode (char *text, char *escaped) void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_options_extra) { + const user_options_t *user_options = hashcat_ctx->user_options; + if (hashconfig_init (hashcat_ctx) == 0) { hashconfig_t *hashconfig = hashcat_ctx->hashconfig; @@ -677,25 +679,70 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us printf ("\"category\": \"%s\", ", strhashcategory (hashconfig->hash_category)); printf ("\"slow_hash\": %s, ", (hashconfig->attack_exec == 
ATTACK_EXEC_INSIDE_KERNEL) ? "false" : "true"); - printf ("\"password_len_min\": %u, ", hashconfig->pw_min); - printf ("\"password_len_max\": %u, ", hashconfig->pw_max); - printf ("\"is_deprecated\": %s, ", (module_ctx->module_deprecated_notice != MODULE_DEFAULT) ? "true" : "false"); - if (module_ctx->module_deprecated_notice != MODULE_DEFAULT) { - const char *deprecated_notice = module_ctx->module_deprecated_notice (hashconfig, hashcat_ctx->user_options, user_options_extra); - printf ("\"deprecated_notice\": \"%s\", ", deprecated_notice); + if (module_ctx->module_deprecated_notice != MODULE_DEFAULT) + { + const char *t_deprecated_notice = module_ctx->module_deprecated_notice (hashconfig, hashcat_ctx->user_options, user_options_extra); + + char *t_deprecated_notice_json_encoded = (char *) hcmalloc (strlen (t_deprecated_notice) * 2); + + json_encode (t_deprecated_notice, t_deprecated_notice_json_encoded); + + printf ("\"deprecated_notice\": \"%s\", ", t_deprecated_notice_json_encoded); + + hcfree (t_deprecated_notice_json_encoded); } + else + { + printf ("\"deprecated_notice\": \"%s\", ", "N/A"); + } + + const char *t_pw_desc = (hashconfig->opts_type & OPTS_TYPE_PT_HEX) ? "HEX" : "plain"; + + u32 t_pw_min = hashconfig->pw_min; + u32 t_pw_max = hashconfig->pw_max; + + if (user_options->hash_info > 1) + { + if (hashconfig->opts_type & OPTS_TYPE_PT_HEX) + { + t_pw_min *= 2; + t_pw_max *= 2; + } + } + + printf ("\"password_type\": %s, ", t_pw_desc); + printf ("\"password_len_min\": %u, ", t_pw_min); + printf ("\"password_len_max\": %u, ", t_pw_max); printf ("\"is_salted\": %s, ", (hashconfig->is_salted == true) ? "true" : "false"); if (hashconfig->is_salted == true) { u32 t = hashconfig->salt_type; - const char *t_desc = (t == SALT_TYPE_EMBEDDED) ? "embedded" : (t == SALT_TYPE_GENERIC) ? "generic" : "virtual"; - printf ("\"salt_type\": \"%s\", ", t_desc); - printf ("\"salt_len_min\": %u, ", hashconfig->salt_min); - printf ("\"salt_len_max\": %u, ", hashconfig->salt_max); + + const char *t_salt_desc = (t == SALT_TYPE_EMBEDDED) ? "embedded" : (t == SALT_TYPE_GENERIC) ? 
"generic" : "virtual"; + + printf ("\"salt_type\": \"%s\", ", t_salt_desc); + + if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED) + { + u32 t_salt_min = hashconfig->salt_min; + u32 t_salt_max = hashconfig->salt_max; + + if (user_options->hash_info > 1) + { + if (hashconfig->opts_type & OPTS_TYPE_ST_HEX) + { + t_salt_min *= 2; + t_salt_max *= 2; + } + } + + printf ("\"salt_len_min\": %u, ", t_salt_min); + printf ("\"salt_len_max\": %u, ", t_salt_max); + } } if ((hashconfig->has_pure_kernel) && (hashconfig->has_optimized_kernel)) @@ -711,6 +758,39 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us printf ("\"kernel_type\": %s, ", "[ \"optimized\" ]"); } + if (user_options->hash_info > 1) + { + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + { + printf ("\"kernel_type_filter\": %s, ", "[ \"optimized\" ]"); + } + else + { + printf ("\"kernel_type_filter\": %s, ", "[ \"pure\" ]"); + } + + printf ("\"attack_mode_filter\": %d, ", user_options->attack_mode); + + // almost always 1 and -1 + printf ("\"hashes_count_min\": %d, ", hashconfig->hashes_count_min); + printf ("\"hashes_count_max\": %d, ", hashconfig->hashes_count_max); + + if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED) + { + bool multi_hash_same_salt = true; + + if ((hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL) == 0) + { + if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) + { + multi_hash_same_salt = false; + } + } + + printf ("\"hashes_with_same_salt\": %s, ", (multi_hash_same_salt == true) ? "true" : "false"); + } + } + if ((hashconfig->st_hash != NULL) && (hashconfig->st_pass != NULL)) { if (hashconfig->opts_type & OPTS_TYPE_BINARY_HASHFILE) @@ -731,7 +811,7 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us char *example_hash_json_encoded = (char *) hcmalloc (strlen (hashconfig->st_hash) * 2); - json_encode ((char *)hashconfig->st_hash, example_hash_json_encoded); + json_encode (hashconfig->st_hash, example_hash_json_encoded); printf ("\"example_hash\": \"%s\", ", example_hash_json_encoded); @@ -807,6 +887,7 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us printf ("\"autodetect_enabled\": %s, ", (hashconfig->opts_type & OPTS_TYPE_AUTODETECT_DISABLE) ? "false" : "true"); printf ("\"self_test_enabled\": %s, ", (hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) ? "false" : "true"); printf ("\"potfile_enabled\": %s, ", (hashconfig->opts_type & OPTS_TYPE_POTFILE_NOPASS) ? "false" : "true"); + printf ("\"keep_guessing\": %s, ", (hashconfig->opts_type & OPTS_TYPE_SUGGEST_KG) ? "true" : "false"); printf ("\"custom_plugin\": %s, ", (hashconfig->opts_type & OPTS_TYPE_STOCK_MODULE) ? 
"false" : "true"); if (hashconfig->opts_type & OPTS_TYPE_PT_ALWAYS_ASCII) @@ -821,8 +902,6 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us { printf ("\"plaintext_encoding\": %s", "[ \"ASCII\", \"HEX\" ]"); } - - event_log_info (hashcat_ctx, NULL); } printf (" }"); @@ -832,30 +911,73 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us void hash_info_single (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_options_extra) { + const user_options_t *user_options = hashcat_ctx->user_options; + if (hashconfig_init (hashcat_ctx) == 0) { hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + module_ctx_t *module_ctx = hashcat_ctx->module_ctx; event_log_info (hashcat_ctx, "Hash mode #%u", hashconfig->hash_mode); event_log_info (hashcat_ctx, " Name................: %s", hashconfig->hash_name); event_log_info (hashcat_ctx, " Category............: %s", strhashcategory (hashconfig->hash_category)); event_log_info (hashcat_ctx, " Slow.Hash...........: %s", (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) ? "No" : "Yes"); - event_log_info (hashcat_ctx, " Password.Len.Min....: %u", hashconfig->pw_min); - event_log_info (hashcat_ctx, " Password.Len.Max....: %u", hashconfig->pw_max); + event_log_info (hashcat_ctx, " Deprecated..........: %s", (module_ctx->module_deprecated_notice != MODULE_DEFAULT) ? "Yes" : "No"); + + char *t_deprecated_notice = "N/A\0"; + + if (module_ctx->module_deprecated_notice != MODULE_DEFAULT) + { + t_deprecated_notice = (char *) module_ctx->module_deprecated_notice (hashconfig, hashcat_ctx->user_options, user_options_extra); + } + + event_log_info (hashcat_ctx, " Deprecated.Notice...: %s", t_deprecated_notice); + + const char *t_pw_desc = (hashconfig->opts_type & OPTS_TYPE_PT_HEX) ? "HEX\0" : "plain\0"; + + u32 t_pw_min = hashconfig->pw_min; + u32 t_pw_max = hashconfig->pw_max; + + if (user_options->hash_info > 1) + { + if (hashconfig->opts_type & OPTS_TYPE_PT_HEX) + { + t_pw_min *= 2; + t_pw_max *= 2; + } + } + + event_log_info (hashcat_ctx, " Password.Type.......: %s", t_pw_desc); + event_log_info (hashcat_ctx, " Password.Len.Min....: %u", t_pw_min); + event_log_info (hashcat_ctx, " Password.Len.Max....: %u", t_pw_max); if (hashconfig->is_salted == true) { u32 t = hashconfig->salt_type; - const char *t_desc = (t == SALT_TYPE_EMBEDDED) ? "Embedded\0" : (t == SALT_TYPE_GENERIC) ? "Generic\0" : "Virtual\0"; - event_log_info (hashcat_ctx, " Salt.Type...........: %s", t_desc); - event_log_info (hashcat_ctx, " Salt.Len.Min........: %u", hashconfig->salt_min); - event_log_info (hashcat_ctx, " Salt.Len.Max........: %u", hashconfig->salt_max); - } - // almost always 1 and -1 - //event_log_info (hashcat_ctx, " Hashes.Count.Min....: %d", hashconfig->hashes_count_min); - //event_log_info (hashcat_ctx, " Hashes.Count.Max....: %u", hashconfig->hashes_count_max); + const char *t_salt_desc = (t == SALT_TYPE_EMBEDDED) ? "Embedded\0" : (t == SALT_TYPE_GENERIC) ? 
"Generic\0" : "Virtual\0"; + + event_log_info (hashcat_ctx, " Salt.Type...........: %s", t_salt_desc); + + if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED) + { + u32 t_salt_min = hashconfig->salt_min; + u32 t_salt_max = hashconfig->salt_max; + + if (user_options->hash_info > 1) + { + if (hashconfig->opts_type & OPTS_TYPE_ST_HEX) + { + t_salt_min *= 2; + t_salt_max *= 2; + } + } + + event_log_info (hashcat_ctx, " Salt.Len.Min........: %u", t_salt_min); + event_log_info (hashcat_ctx, " Salt.Len.Max........: %u", t_salt_max); + } + } if ((hashconfig->has_pure_kernel) && (hashconfig->has_optimized_kernel)) { @@ -870,6 +992,39 @@ void hash_info_single (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_op event_log_info (hashcat_ctx, " Kernel.Type(s)......: optimized"); } + if (user_options->hash_info > 1) + { + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + { + event_log_info (hashcat_ctx, " Kernel.Type.Filter..: optimized"); + } + else + { + event_log_info (hashcat_ctx, " Kernel.Type.Filter..: pure"); + } + + event_log_info (hashcat_ctx, " Attack.Mode.Filter..: %u", user_options->attack_mode); + + // almost always 1 and -1 + event_log_info (hashcat_ctx, " Hashes.Count.Min....: %d", hashconfig->hashes_count_min); + event_log_info (hashcat_ctx, " Hashes.Count.Max....: %d", hashconfig->hashes_count_max); + + if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED) + { + bool multi_hash_same_salt = true; + + if ((hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL) == 0) + { + if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) + { + multi_hash_same_salt = false; + } + } + + event_log_info (hashcat_ctx, " Hashes.w/.Same.Salt.: %s", (multi_hash_same_salt == true) ? "Allowed" : "Not allowed"); + } + } + if ((hashconfig->st_hash != NULL) && (hashconfig->st_pass != NULL)) { if (hashconfig->opts_type & OPTS_TYPE_BINARY_HASHFILE) @@ -973,6 +1128,7 @@ void hash_info_single (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_op event_log_info (hashcat_ctx, " Autodetect.Enabled..: %s", (hashconfig->opts_type & OPTS_TYPE_AUTODETECT_DISABLE) ? "No" : "Yes"); event_log_info (hashcat_ctx, " Self.Test.Enabled...: %s", (hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) ? "No" : "Yes"); event_log_info (hashcat_ctx, " Potfile.Enabled.....: %s", (hashconfig->opts_type & OPTS_TYPE_POTFILE_NOPASS) ? "No" : "Yes"); + event_log_info (hashcat_ctx, " Keep.Guessing.......: %s", (hashconfig->opts_type & OPTS_TYPE_SUGGEST_KG) ? "Yes" : "No"); event_log_info (hashcat_ctx, " Custom.Plugin.......: %s", (hashconfig->opts_type & OPTS_TYPE_STOCK_MODULE) ? 
"No" : "Yes"); if (hashconfig->opts_type & OPTS_TYPE_PT_ALWAYS_ASCII) @@ -1915,7 +2071,7 @@ void status_display_status_json (hashcat_ctx_t *hashcat_ctx) printf (","); } - printf (" { \"device_id\": %u,", device_id + 1); + printf (" { \"device_id\": %d,", device_id + 1); char *device_name_json_encoded = (char *) hcmalloc (strlen (device_info->device_name) * 2); @@ -2419,18 +2575,10 @@ void status_display (hashcat_ctx_t *hashcat_ctx) } else { - event_log_info (hashcat_ctx, - "Remaining........: %u (%.2f%%) Digests", - digests_remain, - digests_remain_percent); + event_log_info (hashcat_ctx, "Remaining........: %u (%.2f%%) Digests", digests_remain, digests_remain_percent); } - } - if (hashcat_status->digests_cnt > 1000) - { - event_log_info (hashcat_ctx, - "Recovered/Time...: %s", - hashcat_status->cpt); + event_log_info (hashcat_ctx, "Recovered/Time...: %s", hashcat_status->cpt); } switch (hashcat_status->progress_mode) @@ -2754,7 +2902,7 @@ void status_speed_json (hashcat_ctx_t *hashcat_ctx) printf (","); } - printf (" { \"device_id\": %u,", device_id + 1); + printf (" { \"device_id\": %d,", device_id + 1); printf (" \"speed\": %" PRIu64 " }", (u64) (device_info->hashes_msec_dev_benchmark * 1000)); device_num++; } @@ -2871,7 +3019,7 @@ void status_progress_json (hashcat_ctx_t *hashcat_ctx) printf (","); } - printf (" { \"device_id\": %u,", device_id + 1); + printf (" { \"device_id\": %d,", device_id + 1); printf (" \"progress\": %" PRIu64 ",", device_info->progress_dev); printf (" \"runtime\": %0.2f }", device_info->runtime_msec_dev); device_num++; diff --git a/src/usage.c b/src/usage.c index 7cbf22348..b72739f51 100644 --- a/src/usage.c +++ b/src/usage.c @@ -94,7 +94,7 @@ static const char *const USAGE_BIG_PRE_HASHMODES[] = " --bitmap-max | Num | Sets maximum bits allowed for bitmaps to X | --bitmap-max=24", " --cpu-affinity | Str | Locks to CPU devices, separated with commas | --cpu-affinity=1,2,3", " --hook-threads | Num | Sets number of threads for a hook (per compute unit) | --hook-threads=8", - " --hash-info | | Show information for each hash-mode |", + " -H, --hash-info | | Show information for each hash-mode | -H or -HH", " --example-hashes | | Alias of --hash-info |", " --backend-ignore-cuda | | Do not try to open CUDA interface on startup |", " --backend-ignore-hip | | Do not try to open HIP interface on startup |", diff --git a/src/user_options.c b/src/user_options.c index b1ed588f4..7adbcf345 100644 --- a/src/user_options.c +++ b/src/user_options.c @@ -21,9 +21,9 @@ #endif #ifdef WITH_BRAIN -static const char *const short_options = "hVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:z"; +static const char *const short_options = "hHVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:z"; #else -static const char *const short_options = "hVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:"; +static const char *const short_options = "hHVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:"; #endif static char *const SEPARATOR = ":"; @@ -415,7 +415,7 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv) case IDX_ENCODING_TO: user_options->encoding_to = optarg; break; case IDX_INDUCTION_DIR: user_options->induction_dir = optarg; break; case IDX_OUTFILE_CHECK_DIR: user_options->outfile_check_dir = optarg; break; - case IDX_HASH_INFO: user_options->hash_info = true; break; + case IDX_HASH_INFO: user_options->hash_info++; break; case IDX_FORCE: user_options->force = true; break; case IDX_SELF_TEST_DISABLE: user_options->self_test = false; break; case 
IDX_SKIP: user_options->skip = hc_strtoull (optarg, NULL, 10); @@ -1291,7 +1291,7 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx) return -1; } - if (user_options->hash_info == true) + if (user_options->hash_info > 0) { event_log_error (hashcat_ctx, "Use of --hash-info is not allowed in benchmark mode."); @@ -1476,6 +1476,13 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx) return -1; } + if (user_options->hash_info > 2) + { + event_log_error (hashcat_ctx, "Invalid --hash-info/-H value, must be greater than or equal to 0 and lower than 3."); + + return -1; + } + #ifdef WITH_BRAIN if ((user_options->brain_client == true) && (user_options->remove == true)) { @@ -1570,7 +1577,7 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx) show_error = false; } } - else if (user_options->hash_info == true) + else if (user_options->hash_info > 0) { if (user_options->hc_argc == 0) { @@ -1768,7 +1775,7 @@ void user_options_session_auto (hashcat_ctx_t *hashcat_ctx) user_options->session = "benchmark"; } - if (user_options->hash_info == true) + if (user_options->hash_info > 0) { user_options->session = "hash_info"; } @@ -1851,12 +1858,12 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) user_options->bitmap_max = 1; } - if (user_options->hash_info == true - || user_options->keyspace == true + if (user_options->keyspace == true || user_options->speed_only == true || user_options->progress_only == true || user_options->identify == true || user_options->usage > 0 + || user_options->hash_info > 0 || user_options->backend_info > 0) { user_options->hwmon = false; @@ -1908,7 +1915,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) } } - if (user_options->hash_info == true) + if (user_options->hash_info > 0) { user_options->quiet = true; } @@ -2022,7 +2029,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) if (user_options->attack_mode == ATTACK_MODE_BF) { - if (user_options->hash_info == true) + if (user_options->hash_info > 0) { } @@ -2278,7 +2285,7 @@ void user_options_extra_init (hashcat_ctx_t *hashcat_ctx) { } - else if (user_options->hash_info == true) + else if (user_options->hash_info > 0) { } From 310e9ee79a75738c6cd4723d88f367170afed10f Mon Sep 17 00:00:00 2001 From: Chick3nman Date: Fri, 30 May 2025 14:13:43 -0500 Subject: [PATCH 02/37] Add --total-candidates flag and functionality --- docs/changes.txt | 1 + include/types.h | 4 ++++ src/hashcat.c | 1 + src/main.c | 13 +++++++++++++ src/terminal.c | 13 +++++++------ src/usage.c | 1 + src/user_options.c | 20 ++++++++++++++++++++ 7 files changed, 47 insertions(+), 6 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index 8521d5591..c7ed132a2 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -66,6 +66,7 @@ - Added new feature (-Y) that creates N virtual instances for each device in your system at the cost of N times the device memory consumption - Added options --benchmark-min and --benchmark-max to set a hash-mode range to be used during the benchmark +- Added option --total-candidates to provide the total candidate count for an attack instead of the internal "--keyspace" value ## ## Performance ## diff --git a/include/types.h b/include/types.h index 880af23ee..f6f82c992 100644 --- a/include/types.h +++ b/include/types.h @@ -107,6 +107,7 @@ typedef enum event_identifier EVENT_BITMAP_INIT_PRE = 0x00000011, EVENT_BITMAP_FINAL_OVERFLOW = 0x00000012, EVENT_CALCULATED_WORDS_BASE = 0x00000020, + EVENT_CALCULATED_WORDS_CNT = 0x00000021, EVENT_CRACKER_FINISHED = 0x00000030,
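Note on the -H wiring in patch 01 above: the short-option string gains 'H', IDX_HASH_INFO is mapped to that letter, and hash_info changes from a bool to a counter that is bumped once per occurrence, so -H selects the basic hash-info output, -HH the extended view, and anything beyond that is rejected by the sanity check. A minimal standalone sketch of that repeat-to-level getopt pattern (not hashcat source; the option table is reduced to the single flag for illustration):

#include <getopt.h>
#include <stdio.h>

int main (int argc, char **argv)
{
  int hash_info = 0; // 0 = off, 1 = -H, 2 = -HH

  static const struct option long_options[] =
  {
    {"hash-info", no_argument, NULL, 'H'},
    {NULL, 0, NULL, 0}
  };

  int c;

  while ((c = getopt_long (argc, argv, "H", long_options, NULL)) != -1)
  {
    if (c == 'H') hash_info++; // -HH hits this case twice, --hash-info once per use
  }

  if (hash_info > 2)
  {
    fprintf (stderr, "Invalid --hash-info/-H value\n");

    return -1;
  }

  printf ("hash_info level: %d\n", hash_info);

  return 0;
}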
EVENT_CRACKER_HASH_CRACKED = 0x00000031, EVENT_CRACKER_STARTING = 0x00000032, @@ -666,6 +667,7 @@ typedef enum user_options_defaults KERNEL_LOOPS = 0, KERNEL_THREADS = 0, KEYSPACE = false, + TOTAL_CANDIDATES = false, LEFT = false, LIMIT = 0, LOGFILE = true, @@ -843,6 +845,7 @@ typedef enum user_options_map IDX_STATUS_TIMER = 0xff4c, IDX_STDOUT_FLAG = 0xff4d, IDX_STDIN_TIMEOUT_ABORT = 0xff4e, + IDX_TOTAL_CANDIDATES = 0xff58, IDX_TRUECRYPT_KEYFILES = 0xff4f, IDX_USERNAME = 0xff50, IDX_VERACRYPT_KEYFILES = 0xff51, @@ -2357,6 +2360,7 @@ typedef struct user_options bool increment; bool keep_guessing; bool keyspace; + bool total_candidates; bool left; bool logfile; bool loopback; diff --git a/src/hashcat.c b/src/hashcat.c index e1607450c..a691d45aa 100644 --- a/src/hashcat.c +++ b/src/hashcat.c @@ -131,6 +131,7 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx) status_ctx->words_base = status_ctx->words_cnt / amplifier_cnt; EVENT (EVENT_CALCULATED_WORDS_BASE); + EVENT (EVENT_CALCULATED_WORDS_CNT); if (user_options->keyspace == true) { diff --git a/src/main.c b/src/main.c index 085ce38b4..6d21cf92f 100644 --- a/src/main.c +++ b/src/main.c @@ -370,10 +370,22 @@ static void main_calculated_words_base (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, const user_options_t *user_options = hashcat_ctx->user_options; if (user_options->keyspace == false) return; + if (user_options->total_candidates == true) return; event_log_info (hashcat_ctx, "%" PRIu64 "", status_ctx->words_base); } +static void main_calculated_words_cnt (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len) +{ + const status_ctx_t *status_ctx = hashcat_ctx->status_ctx; + const user_options_t *user_options = hashcat_ctx->user_options; + + if (user_options->keyspace == false) return; + if (user_options->total_candidates == false) return; + + event_log_info (hashcat_ctx, "%" PRIu64 "", status_ctx->words_cnt); +} + static void main_potfile_remove_parse_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len) { const user_options_t *user_options = hashcat_ctx->user_options; @@ -1171,6 +1183,7 @@ static void event (const u32 id, hashcat_ctx_t *hashcat_ctx, const void *buf, co case EVENT_BITMAP_INIT_PRE: main_bitmap_init_pre (hashcat_ctx, buf, len); break; case EVENT_BITMAP_FINAL_OVERFLOW: main_bitmap_final_overflow (hashcat_ctx, buf, len); break; case EVENT_CALCULATED_WORDS_BASE: main_calculated_words_base (hashcat_ctx, buf, len); break; + case EVENT_CALCULATED_WORDS_CNT: main_calculated_words_cnt (hashcat_ctx, buf, len); break; case EVENT_CRACKER_FINISHED: main_cracker_finished (hashcat_ctx, buf, len); break; case EVENT_CRACKER_HASH_CRACKED: main_cracker_hash_cracked (hashcat_ctx, buf, len); break; case EVENT_CRACKER_STARTING: main_cracker_starting (hashcat_ctx, buf, len); break; diff --git a/src/terminal.c b/src/terminal.c index d87faed19..5afbf5735 100644 --- a/src/terminal.c +++ b/src/terminal.c @@ -28,12 +28,13 @@ void welcome_screen (hashcat_ctx_t *hashcat_ctx, const char *version_tag) { const user_options_t *user_options = hashcat_ctx->user_options; - if (user_options->quiet == true) return; - if (user_options->keyspace == true) return; - if (user_options->stdout_flag == true) return; - if (user_options->show == true) return; - if (user_options->left == true) return; - if (user_options->identify == true) return; + if (user_options->quiet == true) return; + if (user_options->keyspace == true) return; + if (user_options->total_candidates == 
true) return; + if (user_options->stdout_flag == true) return; + if (user_options->show == true) return; + if (user_options->left == true) return; + if (user_options->identify == true) return; if (user_options->usage > 0) { diff --git a/src/usage.c b/src/usage.c index 870c790d0..612c34c04 100644 --- a/src/usage.c +++ b/src/usage.c @@ -120,6 +120,7 @@ static const char *const USAGE_BIG_PRE_HASHMODES[] = " -s, --skip | Num | Skip X words from the start | -s 1000000", " -l, --limit | Num | Limit X words from the start + skipped words | -l 1000000", " --keyspace | | Show keyspace base:mod values and quit |", + " --total-candidates | | Show total candidate count (base*mod) and quit |", " -j, --rule-left | Rule | Single rule applied to each word from left wordlist | -j 'c'", " -k, --rule-right | Rule | Single rule applied to each word from right wordlist | -k '^-'", " -r, --rules-file | File | Multiple rules applied to each word from wordlists | -r rules/best64.rule", diff --git a/src/user_options.c b/src/user_options.c index 2adfc7ac2..8c0adeeb1 100644 --- a/src/user_options.c +++ b/src/user_options.c @@ -87,6 +87,7 @@ static const struct option long_options[] = {"kernel-threads", required_argument, NULL, IDX_KERNEL_THREADS}, {"keyboard-layout-mapping", required_argument, NULL, IDX_KEYBOARD_LAYOUT_MAPPING}, {"keyspace", no_argument, NULL, IDX_KEYSPACE}, + {"total-candidates", no_argument, NULL, IDX_TOTAL_CANDIDATES}, {"left", no_argument, NULL, IDX_LEFT}, {"limit", required_argument, NULL, IDX_LIMIT}, {"logfile-disable", no_argument, NULL, IDX_LOGFILE_DISABLE}, @@ -234,6 +235,7 @@ int user_options_init (hashcat_ctx_t *hashcat_ctx) user_options->kernel_threads = KERNEL_THREADS; user_options->keyboard_layout_mapping = NULL; user_options->keyspace = KEYSPACE; + user_options->total_candidates = TOTAL_CANDIDATES; user_options->left = LEFT; user_options->limit = LIMIT; user_options->logfile = LOGFILE; @@ -430,6 +432,7 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv) user_options->limit_chgd = true; break; case IDX_KEEP_GUESSING: user_options->keep_guessing = true; break; case IDX_KEYSPACE: user_options->keyspace = true; break; + case IDX_TOTAL_CANDIDATES: user_options->total_candidates = true; break; case IDX_BENCHMARK: user_options->benchmark = true; break; case IDX_BENCHMARK_ALL: user_options->benchmark_all = true; break; case IDX_BENCHMARK_MAX: user_options->benchmark_max = hc_strtoul (optarg, NULL, 10); break; @@ -1816,6 +1819,11 @@ void user_options_session_auto (hashcat_ctx_t *hashcat_ctx) user_options->session = "progress_only"; } + if (user_options->total_candidates == true) + { + user_options->session = "candidates"; + } + if (user_options->keyspace == true) { user_options->session = "keyspace"; @@ -1881,6 +1889,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) if (user_options->hash_info == true || user_options->keyspace == true + || user_options->total_candidates == true || user_options->speed_only == true || user_options->progress_only == true || user_options->identify == true @@ -1951,6 +1960,11 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) user_options->speed_only = true; } + if (user_options->total_candidates == true) + { + user_options->quiet = true; + } + if (user_options->keyspace == true) { user_options->quiet = true; @@ -1961,6 +1975,11 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) user_options->backend_vector_width = 1; } + if (user_options->total_candidates == true) + { + user_options->keyspace = true; + } 
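For context on the preprocessing block just above: --total-candidates reuses the keyspace code path (it forces keyspace mode on), but main_calculated_words_cnt prints status_ctx->words_cnt, i.e. the base keyspace multiplied by the amplifier count, instead of the words_base value that --keyspace reports. A small self-contained sketch of that base*mod relationship (not hashcat source; the wordlist and rule counts are made-up example figures):

#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  const uint64_t wordlist_words = 1000000; // hypothetical wordlist line count (the "base")
  const uint64_t amplifier_cnt  = 64;      // hypothetical rule count (the "mod" amplifier)

  const uint64_t words_cnt  = wordlist_words * amplifier_cnt; // total candidate count
  const uint64_t words_base = words_cnt / amplifier_cnt;      // what --keyspace prints

  printf ("--keyspace         : %" PRIu64 "\n", words_base); // 1000000
  printf ("--total-candidates : %" PRIu64 "\n", words_cnt);  // 64000000

  return 0;
}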
+ if (user_options->stdout_flag == true) { user_options->force = true; @@ -3303,6 +3322,7 @@ void user_options_logger (hashcat_ctx_t *hashcat_ctx) logfile_top_uint (user_options->kernel_loops); logfile_top_uint (user_options->kernel_threads); logfile_top_uint (user_options->keyspace); + logfile_top_uint (user_options->total_candidates); logfile_top_uint (user_options->left); logfile_top_uint (user_options->logfile); logfile_top_uint (user_options->loopback); From 7fff4c929aee4542ead786cc1eb0f1a30e06f3f6 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 28 Jun 2025 07:04:44 +0200 Subject: [PATCH 03/37] Fixed a division by zero bug in fast hashes caused by hashes->st_salts_buf->salt_iter not being used. --- src/autotune.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/autotune.c b/src/autotune.c index df6d5529f..87637b29d 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -336,15 +336,20 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param { u32 start = kernel_loops_max; - start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter)); - start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter + 1)); + const u32 salt_iter = hashes->st_salts_buf->salt_iter; - if ((hashes->st_salts_buf->salt_iter % 125) == 0) start = MIN (start, 125); - if ((hashes->st_salts_buf->salt_iter + 1 % 125) == 0) start = MIN (start, 125); - - if ((start >= kernel_loops_min) && (start <= kernel_loops_max)) + if (salt_iter) { - kernel_loops_min_start = start; + start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter)); + start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter + 1)); + + if ((hashes->st_salts_buf->salt_iter % 125) == 0) start = MIN (start, 125); + if ((hashes->st_salts_buf->salt_iter + 1 % 125) == 0) start = MIN (start, 125); + + if ((start >= kernel_loops_min) && (start <= kernel_loops_max)) + { + kernel_loops_min_start = start; + } } } @@ -621,3 +626,4 @@ HC_API_CALL void *thread_autotune (void *p) return NULL; } + From 45b8672270ba9d0abf751e74da567fe42c861afb Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sat, 28 Jun 2025 10:42:47 +0200 Subject: [PATCH 04/37] add --machine-readable format to --backend-info --- src/terminal.c | 705 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 593 insertions(+), 112 deletions(-) diff --git a/src/terminal.c b/src/terminal.c index 948b4795d..2976bc7c2 100644 --- a/src/terminal.c +++ b/src/terminal.c @@ -1075,18 +1075,43 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) const user_options_t *user_options = hashcat_ctx->user_options; const folder_config_t *folder_config = hashcat_ctx->folder_config; + if (user_options->machine_readable == true) + { + printf ("{ "); + } + if (user_options->backend_info > 1) { - event_log_info (hashcat_ctx, "System Info:"); - event_log_info (hashcat_ctx, "============"); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "System Info:"); + event_log_info (hashcat_ctx, "============"); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"SystemInfo\": { "); + } #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__) // TODO - event_log_info (hashcat_ctx, "OS.Name......: Windows"); - event_log_info (hashcat_ctx, "OS.Release...: N/A"); - event_log_info (hashcat_ctx, "HW.Platform..: N/A"); - event_log_info (hashcat_ctx, "HW.Model.....: N/A"); + if 
(user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "OS.Name......: Windows"); + event_log_info (hashcat_ctx, "OS.Release...: N/A"); + event_log_info (hashcat_ctx, "HW.Platform..: N/A"); + event_log_info (hashcat_ctx, "HW.Model.....: N/A"); + } + else + { + printf ("\"OS\": { "); + printf ("\"Name\": \"%s\", ", "Windows"); + printf ("\"Release\": \"%s\" }, ", "N/A"); + printf ("\"Hardware\": { "); + printf ("\"Platform\": \"%s\", ", "N/A"); + printf ("\"Model\": \"%s\" } ", "N/A"); + printf ("}, "); + } #else struct utsname utsbuf; @@ -1123,10 +1148,23 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) rc_uname = true; } - event_log_info (hashcat_ctx, "OS.Name......: %s", (rc_uname == true) ? utsbuf.sysname : "N/A"); - event_log_info (hashcat_ctx, "OS.Release...: %s", (rc_uname == true) ? utsbuf.release : "N/A"); - event_log_info (hashcat_ctx, "HW.Model.....: %s", (rc_sysctl == true) ? hw_model_buf : "N/A"); - event_log_info (hashcat_ctx, "HW.Platform..: %s", (rc_uname == true) ? utsbuf.machine : "N/A"); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "OS.Name......: %s", (rc_uname == true) ? utsbuf.sysname : "N/A"); + event_log_info (hashcat_ctx, "OS.Release...: %s", (rc_uname == true) ? utsbuf.release : "N/A"); + event_log_info (hashcat_ctx, "HW.Platform..: %s", (rc_uname == true) ? utsbuf.machine : "N/A"); + event_log_info (hashcat_ctx, "HW.Model.....: %s", (rc_sysctl == true) ? hw_model_buf : "N/A"); + } + else + { + printf ("\"OS\": { "); + printf ("\"Name\": \"%s\", ", (rc_uname == true) ? utsbuf.sysname : "N/A"); + printf ("\"Release\": \"%s\" }, ", (rc_uname == true) ? utsbuf.release : "N/A"); + printf ("\"Hardware\": { "); + printf ("\"Platform\": \"%s\", ", (rc_uname == true) ? utsbuf.machine : "N/A"); + printf ("\"Model\": \"%s\" } ", (rc_sysctl == true) ? 
hw_model_buf : "N/A"); + printf ("}, "); + } if (rc_sysctl == true) { @@ -1134,38 +1172,72 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) } #endif // _WIN || __CYGWIN__ || __MSYS__ - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, NULL); - event_log_info (hashcat_ctx, "Environment Info:"); - event_log_info (hashcat_ctx, "================="); - event_log_info (hashcat_ctx, NULL); + event_log_info (hashcat_ctx, "Environment Info:"); + event_log_info (hashcat_ctx, "================="); + event_log_info (hashcat_ctx, NULL); - event_log_info (hashcat_ctx, "Cur.Work.Dir.: %s", folder_config->cwd); - event_log_info (hashcat_ctx, "Install.Dir..: %s", folder_config->install_dir); - event_log_info (hashcat_ctx, "Profile.Dir..: %s", folder_config->profile_dir); - event_log_info (hashcat_ctx, "Cache.Dir....: %s", folder_config->cache_dir); - // uninitialized at this point, for instance if the user uses --session - //event_log_info (hashcat_ctx, "Session.Dir..: %s", folder_config->session_dir); - event_log_info (hashcat_ctx, "Shared.Dir...: %s", folder_config->shared_dir); - event_log_info (hashcat_ctx, "CL.Inc.Path..: %s", folder_config->cpath_real); + event_log_info (hashcat_ctx, "Cur.Work.Dir.: %s", folder_config->cwd); + event_log_info (hashcat_ctx, "Install.Dir..: %s", folder_config->install_dir); + event_log_info (hashcat_ctx, "Profile.Dir..: %s", folder_config->profile_dir); + event_log_info (hashcat_ctx, "Cache.Dir....: %s", folder_config->cache_dir); + // uninitialized at this point, for instance if the user uses --session + //event_log_info (hashcat_ctx, "Session.Dir..: %s", folder_config->session_dir); + event_log_info (hashcat_ctx, "Shared.Dir...: %s", folder_config->shared_dir); + event_log_info (hashcat_ctx, "CL.Inc.Path..: %s", folder_config->cpath_real); - event_log_info (hashcat_ctx, NULL); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"EnvironmentInfo\": { "); + printf ("\"CurrentWorkingDirectory\": \"%s\", ", folder_config->cwd); + printf ("\"InstallDirectory\": \"%s\", ", folder_config->install_dir); + printf ("\"ProfileDirectory\": \"%s\", ", folder_config->profile_dir); + printf ("\"CacheDirectory\": \"%s\", ", folder_config->cache_dir); + printf ("\"SharedDirectory\": \"%s\", ", folder_config->shared_dir); + printf ("\"CLIncludePath\": \"%s\" ", folder_config->cpath_real); + printf ("}, "); + } } if (backend_ctx->cuda) { - event_log_info (hashcat_ctx, "CUDA Info:"); - event_log_info (hashcat_ctx, "=========="); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "CUDA Info:"); + event_log_info (hashcat_ctx, "=========="); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"CUDAInfo\": { "); + } int cuda_devices_cnt = backend_ctx->cuda_devices_cnt; int cuda_driver_version = backend_ctx->cuda_driver_version; - event_log_info (hashcat_ctx, "CUDA.Version.: %u.%u", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "CUDA.Version.: %u.%u", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"Version\": \"%u.%u\", ", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10); + printf ("\"BackendDevices\": [ "); + } for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++) { + if 
(user_options->machine_readable == true) + { + printf ("{ "); + } + const int backend_devices_idx = backend_ctx->backend_device_from_cuda[cuda_devices_idx]; const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx; @@ -1184,29 +1256,88 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) if (device_param->device_id_alias_cnt) { - event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + printf ("\"Alias\": \"%02u\", ", device_param->device_id_alias_buf[0] + 1); + } } else { - event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + } } - event_log_info (hashcat_ctx, " Name...........: %s", device_name); - event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); - event_log_info (hashcat_ctx, " Clock..........: %u", device_maxclock_frequency); - event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024); - event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); - event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); - event_log_info (hashcat_ctx, " PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Name...........: %s", device_name); + event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); + event_log_info (hashcat_ctx, " Clock..........: %u", device_maxclock_frequency); + event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024); + event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); + event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); + event_log_info (hashcat_ctx, " PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"Name\": \"%s\", ", device_name); + printf ("\"Processor(s)\": \"%u\", ", device_processors); + printf ("\"Clock\": \"%u\", ", device_maxclock_frequency); + printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024); + printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024); + printf ("\"LocalMemory\": \"%" PRIu64 " MB\", ", device_local_mem_size / 1024); + printf ("\"PCI.Addr.BDFe\": \"%04x:%02x:%02x.%u\" ", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function); + } + + if (user_options->machine_readable == true) + { + if ((cuda_devices_idx + 1) < cuda_devices_cnt) + { + printf ("}, "); + } + else + { + printf ("} "); + } + } + } + + if (user_options->machine_readable == true) + { + if (backend_ctx->hip || backend_ctx->mtl || backend_ctx->ocl) + { + printf ("] }, "); + } + else + { + printf ("] } "); + } } } if (backend_ctx->hip) { - event_log_info (hashcat_ctx, "HIP Info:"); - event_log_info (hashcat_ctx, 
"========="); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "HIP Info:"); + event_log_info (hashcat_ctx, "========="); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"HIPInfo\": { "); + } int hip_devices_cnt = backend_ctx->hip_devices_cnt; int hip_runtimeVersion = backend_ctx->hip_runtimeVersion; @@ -1217,17 +1348,41 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) int hip_version_minor = (hip_runtimeVersion - (hip_version_major * 10000000)) / 100000; int hip_version_patch = (hip_runtimeVersion - (hip_version_major * 10000000) - (hip_version_minor * 100000)); - event_log_info (hashcat_ctx, "HIP.Version.: %u.%u.%u", hip_version_major, hip_version_minor, hip_version_patch); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "HIP.Version.: %u.%u.%u", hip_version_major, hip_version_minor, hip_version_patch); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"Version\": \"%u.%u.%u\", ", hip_version_major, hip_version_minor, hip_version_patch); + } } else { - event_log_info (hashcat_ctx, "HIP.Version.: %u.%u", hip_runtimeVersion / 100, hip_runtimeVersion % 10); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "HIP.Version.: %u.%u", hip_runtimeVersion / 100, hip_runtimeVersion % 10); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"Version\": \"%u.%u\", ", hip_runtimeVersion / 100, hip_runtimeVersion % 10); + } + } + + if (user_options->machine_readable == true) + { + printf ("\"BackendDevices\": [ "); } for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++) { + if (user_options->machine_readable == true) + { + printf ("{ "); + } + const int backend_devices_idx = backend_ctx->backend_device_from_hip[hip_devices_idx]; const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx; @@ -1246,40 +1401,116 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) if (device_param->device_id_alias_cnt) { - event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + printf ("\"Alias\": \"%02u\", ", device_param->device_id_alias_buf[0] + 1); + } } else { - event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + } } - event_log_info (hashcat_ctx, " Name...........: %s", device_name); - event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); - event_log_info (hashcat_ctx, " Clock..........: %u", device_maxclock_frequency); - event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024); - event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); - event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); - event_log_info (hashcat_ctx, " PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function); - 
event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Name...........: %s", device_name); + event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); + event_log_info (hashcat_ctx, " Clock..........: %u", device_maxclock_frequency); + event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024); + event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); + event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); + event_log_info (hashcat_ctx, " PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"Name\": \"%s\", ", device_name); + printf ("\"Processor(s)\": \"%u\", ", device_processors); + printf ("\"Clock\": \"%u\", ", device_maxclock_frequency); + printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024); + printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024); + printf ("\"LocalMemory\": \"%" PRIu64 " MB\", ", device_local_mem_size / 1024); + printf ("\"PCI.Addr.BDFe\": \"%04x:%02x:%02x.%u\" ", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function); + } + + if (user_options->machine_readable == true) + { + if ((hip_devices_idx + 1) < hip_devices_cnt) + { + printf ("}, "); + } + else + { + printf ("} "); + } + } + } + + if (user_options->machine_readable == true) + { + if (backend_ctx->mtl || backend_ctx->ocl) + { + printf ("] }, "); + } + else + { + printf ("] } "); + } } } #if defined (__APPLE__) if (backend_ctx->mtl) { - event_log_info (hashcat_ctx, "Metal Info:"); - event_log_info (hashcat_ctx, "==========="); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Metal Info:"); + event_log_info (hashcat_ctx, "==========="); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"MetalInfo\": { "); + } int metal_devices_cnt = backend_ctx->metal_devices_cnt; char *metal_runtimeVersionStr = backend_ctx->metal_runtimeVersionStr; - event_log_info (hashcat_ctx, "Metal.Version.: %s", metal_runtimeVersionStr); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Metal.Version.: %s", metal_runtimeVersionStr); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"Version\": \"%s\", ", metal_runtimeVersionStr); + } + + if (user_options->machine_readable == true) + { + printf ("\"BackendDevices\": [ "); + } for (int metal_devices_idx = 0; metal_devices_idx < metal_devices_cnt; metal_devices_idx++) { + if (user_options->machine_readable == true) + { + printf ("{ "); + } + const int backend_devices_idx = backend_ctx->backend_device_from_metal[metal_devices_idx]; const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx; @@ -1310,30 +1541,111 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) if (device_param->device_id_alias_cnt) { - event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + printf ("\"Alias\": \"%02u\", ", 
device_param->device_id_alias_buf[0] + 1); + } } else { - event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + } } - event_log_info (hashcat_ctx, " Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator"))); - event_log_info (hashcat_ctx, " Vendor.ID......: %u", opencl_device_vendor_id); - event_log_info (hashcat_ctx, " Vendor.........: %s", opencl_device_vendor); - event_log_info (hashcat_ctx, " Name...........: %s", device_name); - event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); - event_log_info (hashcat_ctx, " Clock..........: N/A"); - event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024); - event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); - event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator"))); + event_log_info (hashcat_ctx, " Vendor.ID......: %u", opencl_device_vendor_id); + event_log_info (hashcat_ctx, " Vendor.........: %s", opencl_device_vendor); + event_log_info (hashcat_ctx, " Name...........: %s", device_name); + event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); + event_log_info (hashcat_ctx, " Clock..........: N/A"); + event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024); + event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); + event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); + } + else + { + printf ("\"Type\": \"%s\", ", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? 
"GPU" : "Accelerator"))); + printf ("\"VendorID\": \"%u\", ", opencl_device_vendor_id); + printf ("\"Vendor\": \"%s\", ", opencl_device_vendor); + printf ("\"Name\": \"%s\", ", device_name); + printf ("\"Processor(s)\": \"%u\", ", device_processors); + printf ("\"Clock\": \"%s\", ", "N/A"); + printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024); + printf ("\"MemoryAllocPerBlock\": \"%" PRIu64 " MB\", ", device_maxmem_alloc / 1024 / 1024); + printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024); + printf ("\"LocalMemory\": \"%" PRIu64 " MB\", ", device_local_mem_size / 1024); + } switch (device_physical_location) { - case MTL_DEVICE_LOCATION_BUILTIN: event_log_info (hashcat_ctx, " Phys.Location..: built-in"); break; - case MTL_DEVICE_LOCATION_SLOT: event_log_info (hashcat_ctx, " Phys.Location..: connected to slot %u", device_location_number); break; - case MTL_DEVICE_LOCATION_EXTERNAL: event_log_info (hashcat_ctx, " Phys.Location..: connected via an external interface (port %u)", device_location_number); break; - case MTL_DEVICE_LOCATION_UNSPECIFIED: event_log_info (hashcat_ctx, " Phys.Location..: unspecified"); break; - default: event_log_info (hashcat_ctx, " Phys.Location..: N/A"); break; + case MTL_DEVICE_LOCATION_BUILTIN: + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Phys.Location..: built-in"); + } + else + { + printf ("\"PhysicalLocation\": \"built-in\", "); + } + + break; + case MTL_DEVICE_LOCATION_SLOT: + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Phys.Location..: connected to slot %u", device_location_number); + } + else + { + printf ("\"PhysicalLocation\": \"connected to slot %u\", ", device_location_number); + } + + break; + case MTL_DEVICE_LOCATION_EXTERNAL: + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Phys.Location..: connected via an external interface (port %u)", device_location_number); + } + else + { + printf ("\"PhysicalLocation\": \"connected via an external interface (port %u)\", ", device_location_number); + } + + break; + case MTL_DEVICE_LOCATION_UNSPECIFIED: + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Phys.Location..: unspecified"); + } + else + { + printf ("\"PhysicalLocation\": \"unspecified\", "); + } + + break; + default: + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Phys.Location..: N/A"); + } + else + { + printf ("\"PhysicalLocation\": \"%s\", ", "N/A"); + } + + break; } /* @@ -1347,28 +1659,92 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) } */ - event_log_info (hashcat_ctx, " Registry.ID....: %u", device_registryID); - - if (device_physical_location != MTL_DEVICE_LOCATION_BUILTIN) + if (user_options->machine_readable == false) { - event_log_info (hashcat_ctx, " Max.TX.Rate....: %u MB/sec", device_max_transfer_rate); + event_log_info (hashcat_ctx, " Registry.ID....: %u", device_registryID); } else { - event_log_info (hashcat_ctx, " Max.TX.Rate....: N/A"); + printf ("\"RegistryID\": \"%u\", ", device_registryID); } - event_log_info (hashcat_ctx, " GPU.Properties.: headless %u, low-power %u, removable %u", device_is_headless, device_is_low_power, device_is_removable); - event_log_info (hashcat_ctx, NULL); + if (device_physical_location != MTL_DEVICE_LOCATION_BUILTIN) + { + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Max.TX.Rate....: %u MB/sec", device_max_transfer_rate); + } 
+ else + { + printf ("\"MaxTXRate\": \"%u MB/sec\", ", device_max_transfer_rate); + } + } + else + { + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Max.TX.Rate....: N/A"); + } + else + { + printf ("\"MaxTXRate\": \"%s\", ", "N/A"); + } + } + + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " GPU.Properties.: headless %u, low-power %u, removable %u", device_is_headless, device_is_low_power, device_is_removable); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"GPUProperties\": { "); + printf ("\"headless\": \"%u\", ", device_is_headless); + printf ("\"low_power\": \"%u\", ", device_is_low_power); + printf ("\"removable\": \"%u\" ", device_is_removable); + printf ("} "); + } + + if (user_options->machine_readable == true) + { + if ((metal_devices_idx + 1) < metal_devices_cnt) + { + printf ("}, "); + } + else + { + printf ("} "); + } + } + } + + if (user_options->machine_readable == true) + { + if (backend_ctx->ocl) + { + printf ("] }, "); + } + else + { + printf ("] } "); + } } } #endif if (backend_ctx->ocl) { - event_log_info (hashcat_ctx, "OpenCL Info:"); - event_log_info (hashcat_ctx, "============"); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "OpenCL Info:"); + event_log_info (hashcat_ctx, "============"); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"OpenCLInfo\": { "); + printf ("\"Platforms\": [ "); + } cl_uint opencl_platforms_cnt = backend_ctx->opencl_platforms_cnt; cl_uint *opencl_platforms_devices_cnt = backend_ctx->opencl_platforms_devices_cnt; @@ -1378,19 +1754,44 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) for (cl_uint opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++) { + if (user_options->machine_readable == true) + { + printf ("{ "); + } + char *opencl_platform_vendor = opencl_platforms_vendor[opencl_platforms_idx]; char *opencl_platform_name = opencl_platforms_name[opencl_platforms_idx]; char *opencl_platform_version = opencl_platforms_version[opencl_platforms_idx]; cl_uint opencl_platform_devices_cnt = opencl_platforms_devices_cnt[opencl_platforms_idx]; - event_log_info (hashcat_ctx, "OpenCL Platform ID #%u", opencl_platforms_idx + 1); - event_log_info (hashcat_ctx, " Vendor..: %s", opencl_platform_vendor); - event_log_info (hashcat_ctx, " Name....: %s", opencl_platform_name); - event_log_info (hashcat_ctx, " Version.: %s", opencl_platform_version); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "OpenCL Platform ID #%u", opencl_platforms_idx + 1); + event_log_info (hashcat_ctx, " Vendor..: %s", opencl_platform_vendor); + event_log_info (hashcat_ctx, " Name....: %s", opencl_platform_name); + event_log_info (hashcat_ctx, " Version.: %s", opencl_platform_version); + event_log_info (hashcat_ctx, NULL); + } + else + { + printf ("\"PlatformID\": \"%u\", ", opencl_platforms_idx + 1); + printf ("\"Vendor\": \"%s\", ", opencl_platform_vendor); + printf ("\"Name\": \"%s\", ", opencl_platform_name); + printf ("\"Version\": \"%s\", ", opencl_platform_version); + } + + if (user_options->machine_readable == true) + { + printf ("\"BackendDevices\": [ "); + } for (cl_uint opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++) { + if (user_options->machine_readable == true) + { + printf ("{ "); + } + const int 
backend_devices_idx = backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx]; const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx; @@ -1412,25 +1813,58 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) if (device_param->device_id_alias_cnt) { - event_log_info (hashcat_ctx, " Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + printf ("\"Alias\": \"%02u\", ", device_param->device_id_alias_buf[0] + 1); + } } else { - event_log_info (hashcat_ctx, " Backend Device ID #%02u", device_id + 1); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Backend Device ID #%02u", device_id + 1); + } + else + { + printf ("\"DeviceID\": \"%02u\", ", device_id + 1); + } } - event_log_info (hashcat_ctx, " Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator"))); - event_log_info (hashcat_ctx, " Vendor.ID......: %u", opencl_device_vendor_id); - event_log_info (hashcat_ctx, " Vendor.........: %s", opencl_device_vendor); - event_log_info (hashcat_ctx, " Name...........: %s", device_name); - event_log_info (hashcat_ctx, " Version........: %s", opencl_device_version); - event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); - event_log_info (hashcat_ctx, " Clock..........: %u", device_maxclock_frequency); - event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024); - event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); - event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); - event_log_info (hashcat_ctx, " OpenCL.Version.: %s", opencl_device_c_version); - event_log_info (hashcat_ctx, " Driver.Version.: %s", opencl_driver_version); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? 
"GPU" : "Accelerator"))); + event_log_info (hashcat_ctx, " Vendor.ID......: %u", opencl_device_vendor_id); + event_log_info (hashcat_ctx, " Vendor.........: %s", opencl_device_vendor); + event_log_info (hashcat_ctx, " Name...........: %s", device_name); + event_log_info (hashcat_ctx, " Version........: %s", opencl_device_version); + event_log_info (hashcat_ctx, " Processor(s)...: %u", device_processors); + event_log_info (hashcat_ctx, " Clock..........: %u", device_maxclock_frequency); + event_log_info (hashcat_ctx, " Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024); + event_log_info (hashcat_ctx, " Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024); + event_log_info (hashcat_ctx, " Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024); + event_log_info (hashcat_ctx, " OpenCL.Version.: %s", opencl_device_c_version); + event_log_info (hashcat_ctx, " Driver.Version.: %s", opencl_driver_version); + } + else + { + printf ("\"Type\": \"%s\", ", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator"))); + printf ("\"VendorID\": \"%u\", ", opencl_device_vendor_id); + printf ("\"Vendor\": \"%s\", ", opencl_device_vendor); + printf ("\"Name\": \"%s\", ", device_name); + printf ("\"Processor(s)\": \"%u\", ", device_processors); + printf ("\"Clock\": \"%u\", ", device_maxclock_frequency); + printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024); + printf ("\"MemoryAllocPerBlock\": \"%" PRIu64 " MB\", ", device_maxmem_alloc / 1024 / 1024); + printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024); + printf ("\"LocalMemory\": \"%" PRIu64 " MB\", ", device_local_mem_size / 1024); + printf ("\"OpenCLVersion\": \"%s\", ", opencl_device_c_version); + printf ("\"DriverVersion\": \"%s\" ", opencl_device_version); + } if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { @@ -1440,18 +1874,68 @@ void backend_info (hashcat_ctx_t *hashcat_ctx) if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) { - event_log_info (hashcat_ctx, " PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function); + } + else + { + printf (", \"PCI.Addr.BDF\": \"%02x:%02x.%u\" ", pcie_bus, pcie_device, pcie_function); + } } if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) { - event_log_info (hashcat_ctx, " PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, " PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function); + } + else + { + printf (", \"PCI.Addr.BDF\": \"%02x:%02x.%u\" ", pcie_bus, pcie_device, pcie_function); + } } } - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, NULL); + } + else + { + if ((opencl_platform_devices_idx + 1) < opencl_platform_devices_cnt) + { + printf ("}, "); + } + else + { + printf ("} "); + } + } + } + + if (user_options->machine_readable == true) + { + if ((opencl_platforms_idx + 1) < opencl_platforms_cnt) + { + printf ("] }, "); + } + else + { + printf ("] } "); + 
} } } + + if (user_options->machine_readable == true) + { + printf ("] } "); + } + } + + if (user_options->machine_readable == true) + { + printf ("}"); + } } @@ -2676,10 +3160,7 @@ void status_display (hashcat_ctx_t *hashcat_ctx) digests_remain, digests_remain_percent); } - } - if (hashcat_status->digests_cnt > 1000) - { event_log_info (hashcat_ctx, "Recovered/Time...: %s", hashcat_status->cpt); From 92b2f996b29fd72f79b82fae18d774b4f683d379 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sat, 28 Jun 2025 10:45:24 +0200 Subject: [PATCH 05/37] update docs/changes.txt --- docs/changes.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changes.txt b/docs/changes.txt index d9ff56f2b..5ff9f0f63 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -125,6 +125,7 @@ - Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use - Backend: Updated filename chksum format to prevent invalid cache on Apple Silicon when switching arch - Backend Checks: Describe workaround in error message when detecting more than 64 backend devices +- Backend Info: Added --machine-readable format - Brain: Added sanity check and corresponding error message for invalid --brain-port values - Dependencies: Added sse2neon v1.8.0 (commit 658eeac) - Dependencies: Updated LZMA SDK to 24.09 From a4bcde8aeddd4579e6950a716a8e9b598ce0792e Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sat, 28 Jun 2025 11:16:39 +0200 Subject: [PATCH 06/37] fix JSON format in hash_info_single_json --- src/terminal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/terminal.c b/src/terminal.c index f15a0f87d..4a9fc45cb 100644 --- a/src/terminal.c +++ b/src/terminal.c @@ -726,7 +726,7 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us } } - printf ("\"password_type\": %s, ", t_pw_desc); + printf ("\"password_type\": \"%s\", ", t_pw_desc); printf ("\"password_len_min\": %u, ", t_pw_min); printf ("\"password_len_max\": %u, ", t_pw_max); From c275c35cedd9817e237652c06af48cdab46a9a8f Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sat, 28 Jun 2025 22:54:36 +0200 Subject: [PATCH 07/37] workaround for a HIP bug and for a potentially similar bug on CUDA --- src/backend.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/backend.c b/src/backend.c index 15e4badbb..fc0051dd4 100644 --- a/src/backend.c +++ b/src/backend.c @@ -15948,6 +15948,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { u32 threads_per_block_with_regs = (floor) ((float) device_param->regsPerBlock / num_regs); + if (threads_per_block_with_regs == 0) + { + // prevent threads_per_block from resulting in 0 due to a bug in the runtime + threads_per_block_with_regs = threads_per_block; + } + if (threads_per_block_with_regs > device_param->kernel_preferred_wgs_multiple) threads_per_block_with_regs -= threads_per_block_with_regs % device_param->kernel_preferred_wgs_multiple; threads_per_block = MIN (threads_per_block, threads_per_block_with_regs); @@ -15967,6 +15973,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { u32 threads_per_block_with_regs = (floor) ((float) device_param->regsPerBlock / num_regs); + if (threads_per_block_with_regs == 0) + { + // https://rocm.docs.amd.com/projects/HIP/en/docs-develop/doxygen/html/bug.html + // HIP-Clang always returns 0 for regsPerBlock due to a known bug + // prevent threads_per_block from resulting in 0, otherwise hashcat crashes + threads_per_block_with_regs = threads_per_block; + } + if
(threads_per_block_with_regs > device_param->kernel_preferred_wgs_multiple) threads_per_block_with_regs -= threads_per_block_with_regs % device_param->kernel_preferred_wgs_multiple; threads_per_block = MIN (threads_per_block, threads_per_block_with_regs); From 7e2c65cc98398f1fcf4026e4374ec3e0f56dca0f Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 13:28:31 +0200 Subject: [PATCH 08/37] Backend: Splitting backend_ctx_devices_init into smaller runtime-specific functions --- docs/changes.txt | 1 + src/backend.c | 210 +++++++++++++++++++++++++++++------------------ 2 files changed, 131 insertions(+), 80 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index cc75fe622..1c2f56a75 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -127,6 +127,7 @@ - Apple Driver: Automatically enable GPU support on Apple OpenCL instead of CPU support - Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use - Backend: Updated filename chksum format to prevent invalid cache on Apple Silicon when switching arch +- Backend: Splitting backend_ctx_devices_init into smaller runtime-specific functions - Backend Checks: Describe workaround in error message when detecting more than 64 backend devices - Brain: Added sanity check and corresponding error message for invalid --brain-port values - Dependencies: Added sse2neon v1.8.0 (commit 658eeac) diff --git a/src/backend.c b/src/backend.c index fc0051dd4..23baa8b25 100644 --- a/src/backend.c +++ b/src/backend.c @@ -5418,37 +5418,13 @@ void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx) memset (backend_ctx, 0, sizeof (backend_ctx_t)); } -int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) +int backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device, bool *need_nvml, MAYBE_UNUSED bool *need_nvapi) { - const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; - const folder_config_t *folder_config = hashcat_ctx->folder_config; - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; - - if (backend_ctx->enabled == false) return 0; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; hc_device_param_t *devices_param = backend_ctx->devices_param; - bool need_adl = false; - bool need_nvml = false; - bool need_nvapi = false; - bool need_sysfs_amdgpu = false; - bool need_sysfs_cpu = false; - bool need_iokit = false; - - int bridge_link_device = 0; // this will only count active device - - int backend_devices_idx = 0; // this will not only count active devices - - bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false; - - int virtmulti = (bridge_ctx->enabled == true) ? 
bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti; - - int virthost = -1; - int virthost_finder = user_options->backend_devices_virthost; - - // CUDA - int cuda_devices_cnt = 0; int cuda_devices_active = 0; @@ -5463,15 +5439,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (is_virtualized == true) { - if ((virthost == -1) && (virthost_finder <= cuda_devices_cnt)) + if ((*virthost == -1) && (*virthost_finder <= cuda_devices_cnt)) { cuda_devices_cnt = virtmulti; - virthost = virthost_finder - 1; + *virthost = *virthost_finder - 1; } else { - virthost_finder -= cuda_devices_cnt; + *virthost_finder -= cuda_devices_cnt; cuda_devices_cnt = 0; } @@ -5481,17 +5457,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // device specific - for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++, backend_devices_idx++) + for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++, (*backend_devices_idx)++) { - const u32 device_id = backend_devices_idx; + const u32 device_id = *backend_devices_idx; - const u32 cuda_devices_idx_real = (is_virtualized == true) ? virthost : cuda_devices_idx; + const u32 cuda_devices_idx_real = (is_virtualized == true) ? *virthost : cuda_devices_idx; - hc_device_param_t *device_param = &devices_param[backend_devices_idx]; + hc_device_param_t *device_param = &devices_param[*backend_devices_idx]; device_param->device_id = device_id; - backend_ctx->backend_device_from_cuda[cuda_devices_idx] = backend_devices_idx; + backend_ctx->backend_device_from_cuda[cuda_devices_idx] = *backend_devices_idx; CUdevice cuda_device; @@ -5791,10 +5767,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) { - need_nvml = true; + *need_nvml = true; #if defined (_WIN) || defined (__CYGWIN__) - need_nvapi = true; + *need_nvapi = true; #endif } @@ -5906,7 +5882,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == false) { - device_param->bridge_link_device = bridge_link_device++; + device_param->bridge_link_device = (*bridge_link_device)++; cuda_devices_active++; } @@ -5916,7 +5892,18 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) backend_ctx->cuda_devices_cnt = cuda_devices_cnt; backend_ctx->cuda_devices_active = cuda_devices_active; - // HIP + return 0; +} + +int backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device, bool *need_adl, MAYBE_UNUSED bool *need_sysfs_amdgpu) +{ + #if defined (__linux__) + const folder_config_t *folder_config = hashcat_ctx->folder_config; + #endif + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; + + hc_device_param_t *devices_param = backend_ctx->devices_param; int hip_devices_cnt = 0; int hip_devices_active = 0; @@ -5932,15 +5919,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (is_virtualized == true) { - if ((virthost == -1) && (virthost_finder <= hip_devices_cnt)) + if ((*virthost == -1) && (*virthost_finder <= hip_devices_cnt)) { hip_devices_cnt = virtmulti; - virthost = virthost_finder - 1; + *virthost = 
*virthost_finder - 1; } else { - virthost_finder -= hip_devices_cnt; + *virthost_finder -= hip_devices_cnt; hip_devices_cnt = 0; } @@ -5950,17 +5937,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // device specific - for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++, backend_devices_idx++) + for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++, (*backend_devices_idx)++) { - const u32 device_id = backend_devices_idx; + const u32 device_id = *backend_devices_idx; - const u32 hip_devices_idx_real = (is_virtualized == true) ? virthost : hip_devices_idx; + const u32 hip_devices_idx_real = (is_virtualized == true) ? *virthost : hip_devices_idx; - hc_device_param_t *device_param = &devices_param[backend_devices_idx]; + hc_device_param_t *device_param = &devices_param[*backend_devices_idx]; device_param->device_id = device_id; - backend_ctx->backend_device_from_hip[hip_devices_idx] = backend_devices_idx; + backend_ctx->backend_device_from_hip[hip_devices_idx] = *backend_devices_idx; hipDevice_t hip_device; @@ -6275,10 +6262,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD_USE_HIP) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)) { - need_adl = true; + *need_adl = true; #if defined (__linux__) - need_sysfs_amdgpu = true; + *need_sysfs_amdgpu = true; #endif } @@ -6404,7 +6391,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == false) { - device_param->bridge_link_device = bridge_link_device++; + device_param->bridge_link_device = (*bridge_link_device)++; hip_devices_active++; } @@ -6414,12 +6401,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) backend_ctx->hip_devices_cnt = hip_devices_cnt; backend_ctx->hip_devices_active = hip_devices_active; - // Metal + return 0; +} + +int backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED bool is_virtualized, MAYBE_UNUSED int virtmulti, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device, MAYBE_UNUSED bool *need_iokit) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; int metal_devices_cnt = 0; int metal_devices_active = 0; #if defined (__APPLE__) + hc_device_param_t *devices_param = backend_ctx->devices_param; + if (backend_ctx->mtl) { // device count @@ -6431,15 +6425,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (is_virtualized == true) { - if ((virthost == -1) && (virthost_finder <= metal_devices_cnt)) + if ((*virthost == -1) && (*virthost_finder <= metal_devices_cnt)) { metal_devices_cnt = virtmulti; - virthost = virthost_finder - 1; + *virthost = *virthost_finder - 1; } else { - virthost_finder -= metal_devices_cnt; + *virthost_finder -= metal_devices_cnt; metal_devices_cnt = 0; } @@ -6449,17 +6443,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // device specific - for (int metal_devices_idx = 0; metal_devices_idx < metal_devices_cnt; metal_devices_idx++, backend_devices_idx++) + for (int metal_devices_idx = 0; metal_devices_idx < metal_devices_cnt; metal_devices_idx++, (*backend_devices_idx)++) { - const u32 device_id = backend_devices_idx; + const u32 device_id = *backend_devices_idx; - const u32 metal_devices_idx_real = (is_virtualized == true) ? 
virthost : metal_devices_idx; + const u32 metal_devices_idx_real = (is_virtualized == true) ? *virthost : metal_devices_idx; - hc_device_param_t *device_param = &devices_param[backend_devices_idx]; + hc_device_param_t *device_param = &devices_param[*backend_devices_idx]; device_param->device_id = device_id; - backend_ctx->backend_device_from_metal[metal_devices_idx] = backend_devices_idx; + backend_ctx->backend_device_from_metal[metal_devices_idx] = *backend_devices_idx; mtl_device_id metal_device = NULL; @@ -6789,7 +6783,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_APPLE)) { - need_iokit = true; + *need_iokit = true; } // CPU burning loop damper @@ -6831,7 +6825,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == false) { - device_param->bridge_link_device = bridge_link_device++; + device_param->bridge_link_device = (*bridge_link_device)++; metal_devices_active++; } @@ -6842,7 +6836,16 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) backend_ctx->metal_devices_cnt = metal_devices_cnt; backend_ctx->metal_devices_active = metal_devices_active; - // OCL + return 0; +} + +int backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device, bool *need_nvml, MAYBE_UNUSED bool *need_nvapi, bool *need_adl, MAYBE_UNUSED bool *need_sysfs_amdgpu, MAYBE_UNUSED bool *need_sysfs_cpu, MAYBE_UNUSED bool *need_iokit) +{ + const folder_config_t *folder_config = hashcat_ctx->folder_config; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; + + hc_device_param_t *devices_param = backend_ctx->devices_param; int opencl_devices_cnt = 0; int opencl_devices_active = 0; @@ -6868,15 +6871,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (is_virtualized == true) { - if ((virthost == -1) && (virthost_finder <= (int) opencl_platform_devices_cnt)) + if ((*virthost == -1) && (*virthost_finder <= (int) opencl_platform_devices_cnt)) { opencl_platform_devices_cnt = virtmulti; - virthost = virthost_finder - 1; + *virthost = *virthost_finder - 1; } else { - virthost_finder -= (int) opencl_platform_devices_cnt; + *virthost_finder -= (int) opencl_platform_devices_cnt; opencl_platform_devices_cnt = 0; } @@ -6884,21 +6887,21 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) opencl_platforms_devices_cnt[opencl_platforms_idx] = opencl_platform_devices_cnt; } - for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++, backend_devices_idx++, opencl_devices_cnt++) + for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++, (*backend_devices_idx)++, opencl_devices_cnt++) { - const u32 device_id = backend_devices_idx; + const u32 device_id = *backend_devices_idx; hc_device_param_t *device_param = &devices_param[device_id]; device_param->device_id = device_id; - backend_ctx->backend_device_from_opencl[opencl_devices_cnt] = backend_devices_idx; + backend_ctx->backend_device_from_opencl[opencl_devices_cnt] = *backend_devices_idx; - 
backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx] = backend_devices_idx; + backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx] = *backend_devices_idx; device_param->opencl_platform_vendor_id = opencl_platform_vendor_id; - device_param->opencl_device = opencl_platform_devices[(is_virtualized == true) ? virthost : (int) opencl_platform_devices_idx]; + device_param->opencl_device = opencl_platform_devices[(is_virtualized == true) ? *virthost : (int) opencl_platform_devices_idx]; //device_param->opencl_platform = opencl_platform; @@ -7688,12 +7691,12 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) #if defined (__APPLE__) if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) { - need_iokit = true; + *need_iokit = true; } #endif #if defined (__linux__) - need_sysfs_cpu = true; + *need_sysfs_cpu = true; #endif } @@ -7701,19 +7704,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) { if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) { - need_adl = true; + *need_adl = true; #if defined (__linux__) - need_sysfs_amdgpu = true; + *need_sysfs_amdgpu = true; #endif } if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) { - need_nvml = true; + *need_nvml = true; #if defined (_WIN) || defined (__CYGWIN__) - need_nvapi = true; + *need_nvapi = true; #endif } @@ -7722,7 +7725,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) { if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) { - need_iokit = true; + *need_iokit = true; } } #endif @@ -8231,7 +8234,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) * activate device */ - device_param->bridge_link_device = bridge_link_device++; + device_param->bridge_link_device = (*bridge_link_device)++; opencl_devices_active++; } @@ -8242,10 +8245,57 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) backend_ctx->opencl_devices_cnt = opencl_devices_cnt; backend_ctx->opencl_devices_active = opencl_devices_active; + return 0; +} + +int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) +{ + const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; + + if (backend_ctx->enabled == false) return 0; + + hc_device_param_t *devices_param = backend_ctx->devices_param; + + bool need_adl = false; + bool need_nvml = false; + bool need_nvapi = false; + bool need_sysfs_amdgpu = false; + bool need_sysfs_cpu = false; + bool need_iokit = false; + + int bridge_link_device = 0; // this will only count active device + + int backend_devices_idx = 0; // this will not only count active devices + + bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false; + + int virtmulti = (bridge_ctx->enabled == true) ? 
bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti; + + int virthost = -1; + int virthost_finder = user_options->backend_devices_virthost; + + // CUDA + + backend_ctx_devices_init_cuda (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device, &need_nvml, &need_nvapi); + + // HIP + + backend_ctx_devices_init_hip (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device, &need_adl, &need_sysfs_amdgpu); + + // Metal + + backend_ctx_devices_init_metal (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device, &need_iokit); + + // OCL + + backend_ctx_devices_init_opencl (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device, &need_nvml, &need_nvapi, &need_adl, &need_sysfs_amdgpu, &need_sysfs_cpu, &need_iokit); + // all devices combined go into backend_* variables - backend_ctx->backend_devices_cnt = cuda_devices_cnt + hip_devices_cnt + metal_devices_cnt + opencl_devices_cnt; - backend_ctx->backend_devices_active = cuda_devices_active + hip_devices_active + metal_devices_active + opencl_devices_active; + backend_ctx->backend_devices_cnt = backend_ctx->cuda_devices_cnt + backend_ctx->hip_devices_cnt + backend_ctx->metal_devices_cnt + backend_ctx->opencl_devices_cnt; + backend_ctx->backend_devices_active = backend_ctx->cuda_devices_active + backend_ctx->hip_devices_active + backend_ctx->metal_devices_active + backend_ctx->opencl_devices_active; #if defined (__APPLE__) // disable Metal devices if at least one OpenCL device is enabled From 0c2ed0d1991e97c1ae76006adfee656e637a93f2 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 29 Jun 2025 14:39:14 +0200 Subject: [PATCH 09/37] Update plugins that benefit from an artificially limited register count (NVIDIA). Update default hash settings to 64MiB:3:4 for Argon2 in -m 70000, following RFC 9106 recommendations. Add option OPTS_TYPE_THREAD_MULTI_DISABLE: allows plugin developers to disable scaling the password candidate batch size based on device thread count. This can be useful for super slow hash algorithms that utilize threads differently, e.g., when the algorithm allows parallelization. Note: thread count for the device can still be set normally. Add options OPTI_TYPE_SLOW_HASH_DIMY_INIT/LOOP/COMP: enable 2D launches for the slow hash init/loop/comp kernels with dimensions X and Y. The Y value must be set via the salt->salt_dimy attribute. Change autotune kernel-loops start value to the lowest multiple of the target hash iteration count, if kernel_loops_min permits. Fix a bug in autotune where kernel_threads_max was not respected during initial init and loop-prepare kernel runs.
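As a compact, self-contained illustration of the two new mechanisms described above (this sketch is not part of the patch itself): the flag values below are copied from the include/types.h hunk further down, while the helper function, the sample device numbers and the printed labels are invented for this example only. It shows how OPTS_TYPE_THREAD_MULTI_DISABLE removes the thread-count multiplier from the candidate batch size, and how OPTI_TYPE_SLOW_HASH_DIMY_LOOP turns the loop kernel into a 2D launch whose Y dimension is taken from salt->salt_dimy.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define OPTS_TYPE_MP_MULTI_DISABLE     (1ULL << 55) // existing flag, value as in include/types.h
#define OPTS_TYPE_THREAD_MULTI_DISABLE (1ULL << 56) // new flag added by this patch
#define OPTI_TYPE_SLOW_HASH_DIMY_LOOP  (1u << 24)   // new flag added by this patch

// mirrors the hardware_power * kernel_accel computation in the autotune.c and
// backend.c hunks below: each *_MULTI_DISABLE flag drops one multiplier
static uint64_t kernel_power (uint64_t opts_type, uint32_t device_processors, uint32_t kernel_threads, uint32_t kernel_accel)
{
  const uint32_t mp  = (opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_processors;
  const uint32_t thr = (opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads;

  return (uint64_t) mp * thr * kernel_accel;
}

int main (void)
{
  const uint32_t processors = 64, threads = 256, accel = 4; // made-up device values

  printf ("default power...............: %" PRIu64 "\n", kernel_power (0,                              processors, threads, accel));
  printf ("with THREAD_MULTI_DISABLE...: %" PRIu64 "\n", kernel_power (OPTS_TYPE_THREAD_MULTI_DISABLE, processors, threads, accel));

  // 2D launch shape for the loop kernel: X stays the per-block thread count,
  // Y comes from salt_dimy when the plugin sets the DIMY flag
  const uint32_t opti_type = OPTI_TYPE_SLOW_HASH_DIMY_LOOP;
  const uint32_t salt_dimy = 4; // would be read from salt->salt_dimy

  const uint32_t blockDimY = (opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) ? salt_dimy : 1;

  printf ("loop kernel blockDim........: (%u, %u, 1)\n", threads, blockDimY);

  return 0;
}

The intent, as far as the message above states it, is that a plugin for a very slow algorithm that parallelizes internally can keep a normal per-block thread count for the launch while preventing the host from multiplying the number of password candidates per batch by that thread count.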
--- OpenCL/inc_types.h | 1 + include/types.h | 18 ++++--- src/autotune.c | 22 ++++---- src/backend.c | 101 +++++++++++++++++++++++++++++------ src/modules/module_01800.c | 1 + src/modules/module_06400.c | 1 + src/modules/module_06800.c | 1 + src/modules/module_07700.c | 1 + src/modules/module_07701.c | 1 + src/modules/module_08300.c | 3 +- src/modules/module_13751.c | 3 +- src/modules/module_13752.c | 3 +- src/modules/module_13753.c | 3 +- src/modules/module_13761.c | 3 +- src/modules/module_13762.c | 3 +- src/modules/module_13763.c | 3 +- src/modules/module_14800.c | 1 + src/modules/module_14900.c | 3 +- src/modules/module_15600.c | 1 + src/modules/module_16300.c | 1 + src/modules/module_16700.c | 1 + src/modules/module_16900.c | 1 + src/modules/module_18100.c | 1 + src/modules/module_20800.c | 1 + src/modules/module_21300.c | 1 + src/modules/module_21500.c | 1 + src/modules/module_21700.c | 1 + src/modules/module_22100.c | 3 +- src/modules/module_22300.c | 1 + src/modules/module_22921.c | 3 +- src/modules/module_22941.c | 3 +- src/modules/module_23400.c | 1 + src/modules/module_23600.c | 1 + src/modules/module_23800.c | 3 +- src/modules/module_24200.c | 1 + src/modules/module_24420.c | 1 + src/modules/module_25500.c | 1 + src/modules/module_25900.c | 3 +- src/modules/module_26000.c | 1 + src/modules/module_26100.c | 1 + src/modules/module_26600.c | 1 + src/modules/module_26700.c | 3 +- src/modules/module_26800.c | 3 +- src/modules/module_26900.c | 1 + src/modules/module_27300.c | 1 + src/modules/module_27400.c | 1 + src/modules/module_27500.c | 1 + src/modules/module_27600.c | 1 + src/modules/module_29451.c | 3 +- src/modules/module_29452.c | 3 +- src/modules/module_29453.c | 3 +- src/modules/module_29461.c | 3 +- src/modules/module_29462.c | 3 +- src/modules/module_29463.c | 3 +- src/modules/module_70000.c | 2 +- src/shared.c | 6 +++ tools/test_modules/m70000.pm | 2 +- 57 files changed, 190 insertions(+), 54 deletions(-) diff --git a/OpenCL/inc_types.h b/OpenCL/inc_types.h index 233e28958..a13c89b8e 100644 --- a/OpenCL/inc_types.h +++ b/OpenCL/inc_types.h @@ -2008,6 +2008,7 @@ typedef struct salt u32 salt_len_pc; u32 salt_iter; u32 salt_iter2; + u32 salt_dimy; u32 salt_sign[2]; u32 salt_repeats; diff --git a/include/types.h b/include/types.h index 22c57d85a..600e7bd93 100644 --- a/include/types.h +++ b/include/types.h @@ -412,6 +412,9 @@ typedef enum opti_type OPTI_TYPE_REGISTER_LIMIT = (1 << 20), // We'll limit the register count to 128 OPTI_TYPE_SLOW_HASH_SIMD_INIT2 = (1 << 21), OPTI_TYPE_SLOW_HASH_SIMD_LOOP2 = (1 << 22), + OPTI_TYPE_SLOW_HASH_DIMY_INIT = (1 << 23), + OPTI_TYPE_SLOW_HASH_DIMY_LOOP = (1 << 24), + OPTI_TYPE_SLOW_HASH_DIMY_COMP = (1 << 25), } opti_type_t; @@ -476,14 +479,17 @@ typedef enum opts_type OPTS_TYPE_DYNAMIC_SHARED = (1ULL << 53), // use dynamic shared memory (note: needs special kernel changes) OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 54), // some algos use JiT in combinations with a salt or create too much startup time OPTS_TYPE_MP_MULTI_DISABLE = (1ULL << 55), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings - OPTS_TYPE_NATIVE_THREADS = (1ULL << 56), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps) - OPTS_TYPE_MAXIMUM_THREADS = (1ULL << 57), // disable else branch in pre-compilation thread count optimization setting - OPTS_TYPE_POST_AMP_UTF16LE = (1ULL << 58), // run the utf8 to utf16le conversion kernel after they have been processed from 
amplifiers + OPTS_TYPE_THREAD_MULTI_DISABLE // do not multiply the kernel-power with the thread count per device for super slow algos + = (1ULL << 56), + OPTS_TYPE_NATIVE_THREADS = (1ULL << 57), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps) + OPTS_TYPE_MAXIMUM_THREADS = (1ULL << 58), // disable else branch in pre-compilation thread count optimization setting + OPTS_TYPE_POST_AMP_UTF16LE = (1ULL << 59), // run the utf8 to utf16le conversion kernel after they have been processed from amplifiers OPTS_TYPE_AUTODETECT_DISABLE - = (1ULL << 59), // skip autodetect engine - OPTS_TYPE_STOCK_MODULE = (1ULL << 60), // module included with hashcat default distribution + = (1ULL << 60), // skip autodetect engine + OPTS_TYPE_STOCK_MODULE = (1ULL << 61), // module included with hashcat default distribution OPTS_TYPE_MULTIHASH_DESPITE_ESALT - = (1ULL << 61), // overrule multihash cracking check same salt but not same esalt + = (1ULL << 62), // overrule multihash cracking check same salt but not same esalt + OPTS_TYPE_MAXIMUM_ACCEL = (1ULL << 63) // try to maximize kernel-accel during autotune } opts_type_t; diff --git a/src/autotune.c b/src/autotune.c index 87637b29d..065c0a217 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -43,7 +43,8 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par device_param->kernel_param.loop_cnt = kernel_loops; // not a bug, both need to be set device_param->kernel_param.il_cnt = kernel_loops; // because there's two variables for inner iters for slow and fast hashes - const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads; + const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) + * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads); u32 kernel_power_try = hardware_power * kernel_accel; @@ -133,7 +134,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param device_param->kernel_accel = kernel_accel_min; device_param->kernel_loops = kernel_loops_min; device_param->kernel_threads = kernel_threads_min; - device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads_min; + device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) + * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads_min); device_param->kernel_power = device_param->hardware_power * kernel_accel_min; } @@ -212,7 +214,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // from here it's clear we are allowed to autotune // so let's init some fake words - const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads_max; + const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) + * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 
1 : kernel_threads_max); u32 kernel_power_max = hardware_power_max * kernel_accel_max; @@ -298,13 +301,13 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param { const u32 kernel_threads_sav = device_param->kernel_threads; - device_param->kernel_threads = device_param->kernel_wgs1; + device_param->kernel_threads = MIN (device_param->kernel_wgs1, kernel_threads_max); run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_max, false, 0, true); if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE) { - device_param->kernel_threads = device_param->kernel_wgs2p; + device_param->kernel_threads = MIN (device_param->kernel_wgs2p, kernel_threads_max); run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_max, false, 0, true); } @@ -330,8 +333,6 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // v7 autotuner is a lot more straight forward - u32 kernel_loops_min_start = kernel_loops_min; - if (hashes && hashes->st_salts_buf) { u32 start = kernel_loops_max; @@ -348,12 +349,12 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if ((start >= kernel_loops_min) && (start <= kernel_loops_max)) { - kernel_loops_min_start = start; + kernel_loops = start; } } } - for (u32 kernel_loops_test = kernel_loops_min_start; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1) + for (u32 kernel_loops_test = kernel_loops; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1) { double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_test, kernel_threads_min, 2); @@ -564,7 +565,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param device_param->kernel_loops = kernel_loops; device_param->kernel_threads = kernel_threads; - const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads; + const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) + * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 
1 : device_param->kernel_threads); device_param->hardware_power = hardware_power; diff --git a/src/backend.c b/src/backend.c index fc0051dd4..00cac2245 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2598,7 +2598,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (kernel_threads == 0) kernel_threads = 1; - num_elements = CEILDIV (num_elements, kernel_threads); + if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0) + { + num_elements = CEILDIV (num_elements, kernel_threads); + } if (kern_run == KERN_RUN_1) { @@ -2636,14 +2639,29 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con } } + u32 gridDimX = num_elements; + u32 gridDimY = 1; + u32 gridDimZ = 1; + + u32 blockDimX = kernel_threads; + u32 blockDimY = 1; + u32 blockDimZ = 1; + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1)) + blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy; + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2)) + blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy; + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3)) + blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy; + if (is_autotune == true) { - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; } if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1; @@ -2699,7 +2717,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (kernel_threads == 0) kernel_threads = 1; - num_elements = CEILDIV (num_elements, kernel_threads); + if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0) + { + num_elements = CEILDIV (num_elements, kernel_threads); + } if (kern_run == KERN_RUN_1) { @@ -2737,14 +2758,31 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con } } + u32 gridDimX = num_elements; + u32 gridDimY = 1; + u32 gridDimZ = 1; + + u32 blockDimX = kernel_threads; + u32 blockDimY = 1; + u32 blockDimZ = 1; + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1)) + blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy; + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2)) + blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy; + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3)) + blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy; + + //printf ("%d %d %d %d %d %d %d\n", kern_run, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ); + if (is_autotune == true) 
{ - if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1; + if (hc_hipLaunchKernel (hashcat_ctx, hip_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1; } if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event1, device_param->hip_stream) == -1) return -1; - if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1; + if (hc_hipLaunchKernel (hashcat_ctx, hip_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1; if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event2, device_param->hip_stream) == -1) return -1; @@ -2984,17 +3022,44 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con } } - num_elements = round_up_multiple_64 (num_elements, kernel_threads); + if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0) + { + num_elements = round_up_multiple_64 (num_elements, kernel_threads); + } + else + { + num_elements = num_elements * kernel_threads; + } - const size_t global_work_size[3] = { num_elements, 1, 1 }; - const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + size_t global_work_size[3] = { num_elements, 1, 1 }; + size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + cl_uint work_dim = 1; + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1)) + { + global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; + } + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2)) + { + global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; + } + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3)) + { + global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; + } if (is_autotune == true) { - if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1; } - if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1; // spin damper section @@ -8952,7 +9017,8 @@ void backend_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx) device_param_dst->kernel_loops = device_param_src->kernel_loops; device_param_dst->kernel_threads = device_param_src->kernel_threads; - const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 
1 : device_param_dst->device_processors) * device_param_dst->kernel_threads; + const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param_dst->device_processors) + * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : device_param_dst->kernel_threads); device_param_dst->hardware_power = hardware_power; @@ -9522,7 +9588,11 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-architecture=%s", device_param->gcnArchName); - hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); + + if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0) + { + hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); + } // untested but it should work #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__) @@ -10436,7 +10506,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) * device properties */ - const u32 device_processors = device_param->device_processors; + //const u32 device_processors = device_param->device_processors; /** * device threads @@ -16059,7 +16129,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // device_param->kernel_threads = kernel_threads; device_param->kernel_threads = 0; - u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * device_param->kernel_threads_max; + const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) + * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 
1 : device_param->kernel_threads_max); u32 kernel_accel_min = device_param->kernel_accel_min; u32 kernel_accel_max = device_param->kernel_accel_max; diff --git a/src/modules/module_01800.c b/src/modules/module_01800.c index fea4851c6..f0d2e9623 100644 --- a/src/modules/module_01800.c +++ b/src/modules/module_01800.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_OS; static const char *HASH_NAME = "sha512crypt $6$, SHA512 (Unix)"; static const u64 KERN_TYPE = 1800; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_USES_BITS_64; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_06400.c b/src/modules/module_06400.c index b7ac9d87f..a553846c7 100644 --- a/src/modules/module_06400.c +++ b/src/modules/module_06400.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_OS; static const char *HASH_NAME = "AIX {ssha256}"; static const u64 KERN_TYPE = 6400; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_06800.c b/src/modules/module_06800.c index 215f26992..e6cf96fd6 100644 --- a/src/modules/module_06800.c +++ b/src/modules/module_06800.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER; static const char *HASH_NAME = "LastPass + LastPass sniffed"; static const u64 KERN_TYPE = 6800; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_07700.c b/src/modules/module_07700.c index 2642a0bb7..1d783d351 100644 --- a/src/modules/module_07700.c +++ b/src/modules/module_07700.c @@ -21,6 +21,7 @@ static const char *HASH_NAME = "SAP CODVN B (BCODE)"; static const u64 KERN_TYPE = 7700; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_PRECOMPUTE_INIT + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_NOT_ITERATED; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_07701.c b/src/modules/module_07701.c index bc852639b..232d7ca4d 100644 --- a/src/modules/module_07701.c +++ b/src/modules/module_07701.c @@ -21,6 +21,7 @@ static const char *HASH_NAME = "SAP CODVN B (BCODE) from RFC_READ_TABLE"; static const u64 KERN_TYPE = 7701; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_PRECOMPUTE_INIT + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_NOT_ITERATED; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_08300.c b/src/modules/module_08300.c index c499667ae..80dd3475d 100644 --- a/src/modules/module_08300.c +++ b/src/modules/module_08300.c @@ -19,7 +19,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_5; static const u32 HASH_CATEGORY = HASH_CATEGORY_NETWORK_SERVER; static const char *HASH_NAME = "DNSSEC (NSEC3)"; static const u64 KERN_TYPE = 8300; -static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_BE | OPTS_TYPE_ST_HEX diff --git a/src/modules/module_13751.c b/src/modules/module_13751.c index dac31a633..ee27acfd6 100644 --- a/src/modules/module_13751.c +++ b/src/modules/module_13751.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = 
HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 512 bit (legacy)"; static const u64 KERN_TYPE = 13751; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_BINARY_HASHFILE diff --git a/src/modules/module_13752.c b/src/modules/module_13752.c index e959a80af..095758dd2 100644 --- a/src/modules/module_13752.c +++ b/src/modules/module_13752.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1024 bit (legacy)"; static const u64 KERN_TYPE = 13752; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_BINARY_HASHFILE diff --git a/src/modules/module_13753.c b/src/modules/module_13753.c index 277afaf02..4eee98625 100644 --- a/src/modules/module_13753.c +++ b/src/modules/module_13753.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1536 bit (legacy)"; static const u64 KERN_TYPE = 13753; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_BINARY_HASHFILE diff --git a/src/modules/module_13761.c b/src/modules/module_13761.c index e33a693e2..5b1b82b27 100644 --- a/src/modules/module_13761.c +++ b/src/modules/module_13761.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 512 bit + boot-mode (legacy)"; static const u64 KERN_TYPE = 13751; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_BINARY_HASHFILE diff --git a/src/modules/module_13762.c b/src/modules/module_13762.c index 0f5f18545..6f1a27929 100644 --- a/src/modules/module_13762.c +++ b/src/modules/module_13762.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1024 bit + boot-mode (legacy)"; static const u64 KERN_TYPE = 13752; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_BINARY_HASHFILE diff --git a/src/modules/module_13763.c b/src/modules/module_13763.c index c9e28d4c7..fb50002a2 100644 --- a/src/modules/module_13763.c +++ b/src/modules/module_13763.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1536 bit + boot-mode (legacy)"; static const u64 KERN_TYPE = 13753; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_BINARY_HASHFILE diff --git a/src/modules/module_14800.c b/src/modules/module_14800.c index 9f3c1ca0f..1e4b91a15 100644 --- 
a/src/modules/module_14800.c +++ b/src/modules/module_14800.c @@ -21,6 +21,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_ARCHIVE; static const char *HASH_NAME = "iTunes backup >= 10.0"; static const u64 KERN_TYPE = 14800; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP | OPTI_TYPE_SLOW_HASH_SIMD_LOOP2; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE diff --git a/src/modules/module_14900.c b/src/modules/module_14900.c index cbe009024..fcc639688 100644 --- a/src/modules/module_14900.c +++ b/src/modules/module_14900.c @@ -19,7 +19,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_CIPHER_KPA; static const char *HASH_NAME = "Skip32 (PT = $salt, key = $pass)"; static const u64 KERN_TYPE = 14900; -static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_SUGGEST_KG; diff --git a/src/modules/module_15600.c b/src/modules/module_15600.c index 1f2b7b9a3..66469d9bf 100644 --- a/src/modules/module_15600.c +++ b/src/modules/module_15600.c @@ -21,6 +21,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_CRYPTOCURRENCY_WALLET; static const char *HASH_NAME = "Ethereum Wallet, PBKDF2-HMAC-SHA256"; static const u64 KERN_TYPE = 15600; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_16300.c b/src/modules/module_16300.c index e7507f1fa..6a12a1b39 100644 --- a/src/modules/module_16300.c +++ b/src/modules/module_16300.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_CRYPTOCURRENCY_WALLET; static const char *HASH_NAME = "Ethereum Pre-Sale Wallet, PBKDF2-HMAC-SHA256"; static const u64 KERN_TYPE = 16300; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_16700.c b/src/modules/module_16700.c index 9dce98e62..2fec2f557 100644 --- a/src/modules/module_16700.c +++ b/src/modules/module_16700.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "FileVault 2"; static const u64 KERN_TYPE = 16200; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_16900.c b/src/modules/module_16900.c index 443da6007..70eca3f16 100644 --- a/src/modules/module_16900.c +++ b/src/modules/module_16900.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER; static const char *HASH_NAME = "Ansible Vault"; static const u64 KERN_TYPE = 16900; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_18100.c b/src/modules/module_18100.c index 46dadfe60..678cb8265 100644 --- a/src/modules/module_18100.c +++ b/src/modules/module_18100.c @@ -21,6 +21,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_OTP; static const char *HASH_NAME = "TOTP (HMAC-SHA1)"; static const u64 KERN_TYPE = 18100; static const u32 
OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_NOT_ITERATED; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_BE diff --git a/src/modules/module_20800.c b/src/modules/module_20800.c index 6cece615d..1330b99bc 100644 --- a/src/modules/module_20800.c +++ b/src/modules/module_20800.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH_SALTED; static const char *HASH_NAME = "sha256(md5($pass))"; static const u64 KERN_TYPE = 20800; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_21300.c b/src/modules/module_21300.c index 75e57c1f4..34ebff7f5 100644 --- a/src/modules/module_21300.c +++ b/src/modules/module_21300.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH_SALTED; static const char *HASH_NAME = "md5($salt.sha1($salt.$pass))"; static const u64 KERN_TYPE = 21300; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_21500.c b/src/modules/module_21500.c index dba580a7c..8936be11c 100644 --- a/src/modules/module_21500.c +++ b/src/modules/module_21500.c @@ -21,6 +21,7 @@ static const char *HASH_NAME = "SolarWinds Orion"; static const u64 KERN_TYPE = 21500; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_USES_BITS_64 + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_21700.c b/src/modules/module_21700.c index 65536027b..e00cf232f 100644 --- a/src/modules/module_21700.c +++ b/src/modules/module_21700.c @@ -22,6 +22,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_CRYPTOCURRENCY_WALLET; static const char *HASH_NAME = "Electrum Wallet (Salt-Type 4)"; static const u64 KERN_TYPE = 21700; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE diff --git a/src/modules/module_22100.c b/src/modules/module_22100.c index 344218243..4d79d30bf 100644 --- a/src/modules/module_22100.c +++ b/src/modules/module_22100.c @@ -20,7 +20,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "BitLocker"; static const u64 KERN_TYPE = 22100; -static const u32 OPTI_TYPE = OPTI_TYPE_SLOW_HASH_SIMD_LOOP; +static const u32 OPTI_TYPE = OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_MP_MULTI_DISABLE; diff --git a/src/modules/module_22300.c b/src/modules/module_22300.c index a85d1d753..04a4bc08c 100644 --- a/src/modules/module_22300.c +++ b/src/modules/module_22300.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH_SALTED; static const char *HASH_NAME = "sha256($salt.$pass.$salt)"; static const u64 KERN_TYPE = 22300; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_22921.c b/src/modules/module_22921.c index 163aacee9..c6471a232 100644 --- a/src/modules/module_22921.c +++ b/src/modules/module_22921.c @@ -19,7 +19,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; 
static const u32 HASH_CATEGORY = HASH_CATEGORY_PRIVATE_KEY; static const char *HASH_NAME = "RSA/DSA/EC/OpenSSH Private Keys ($6$)"; static const u64 KERN_TYPE = 22921; -static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; diff --git a/src/modules/module_22941.c b/src/modules/module_22941.c index 4b4bf09ac..65e7d7c2b 100644 --- a/src/modules/module_22941.c +++ b/src/modules/module_22941.c @@ -19,7 +19,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; static const u32 HASH_CATEGORY = HASH_CATEGORY_PRIVATE_KEY; static const char *HASH_NAME = "RSA/DSA/EC/OpenSSH Private Keys ($4$)"; static const u64 KERN_TYPE = 22941; -static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; diff --git a/src/modules/module_23400.c b/src/modules/module_23400.c index 13743815f..c2cc2a2e2 100644 --- a/src/modules/module_23400.c +++ b/src/modules/module_23400.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER; static const char *HASH_NAME = "Bitwarden"; static const u64 KERN_TYPE = 23400; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_23600.c b/src/modules/module_23600.c index fdb3467d1..0b8994ff7 100644 --- a/src/modules/module_23600.c +++ b/src/modules/module_23600.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_ARCHIVE; static const char *HASH_NAME = "AxCrypt 2 AES-256"; static const u64 KERN_TYPE = 23600; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE diff --git a/src/modules/module_23800.c b/src/modules/module_23800.c index c6c8fec61..c75a9aaaf 100644 --- a/src/modules/module_23800.c +++ b/src/modules/module_23800.c @@ -20,7 +20,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; // actually only DGST_SIZE_4_ static const u32 HASH_CATEGORY = HASH_CATEGORY_ARCHIVE; static const char *HASH_NAME = "RAR3-p (Compressed)"; static const u64 KERN_TYPE = 23800; -static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_HOOK23 diff --git a/src/modules/module_24200.c b/src/modules/module_24200.c index 8b6a8e5c8..20ea6ccec 100644 --- a/src/modules/module_24200.c +++ b/src/modules/module_24200.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_DATABASE_SERVER; static const char *HASH_NAME = "MongoDB ServerKey SCRAM-SHA-256"; static const u64 KERN_TYPE = 24200; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_24420.c b/src/modules/module_24420.c index 39820834a..544e07ee1 100644 --- a/src/modules/module_24420.c +++ b/src/modules/module_24420.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PRIVATE_KEY; 
static const char *HASH_NAME = "PKCS#8 Private Keys (PBKDF2-HMAC-SHA256 + 3DES/AES)"; static const u64 KERN_TYPE = 24420; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_25500.c b/src/modules/module_25500.c index afc451fb9..06a1e795d 100644 --- a/src/modules/module_25500.c +++ b/src/modules/module_25500.c @@ -21,6 +21,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_CRYPTOCURRENCY_WALLET; static const char *HASH_NAME = "Stargazer Stellar Wallet XLM"; static const u64 KERN_TYPE = 25500; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_25900.c b/src/modules/module_25900.c index 7ad951f6d..a460a3bd8 100644 --- a/src/modules/module_25900.c +++ b/src/modules/module_25900.c @@ -19,7 +19,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; static const u32 HASH_CATEGORY = HASH_CATEGORY_NETWORK_SERVER; static const char *HASH_NAME = "KNX IP Secure - Device Authentication Code"; static const u64 KERN_TYPE = 25900; -static const u32 OPTI_TYPE = OPTI_TYPE_SLOW_HASH_SIMD_LOOP; +static const u32 OPTI_TYPE = OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_DEEP_COMP_KERNEL; diff --git a/src/modules/module_26000.c b/src/modules/module_26000.c index 35a6dd937..8ed90d5ee 100644 --- a/src/modules/module_26000.c +++ b/src/modules/module_26000.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER; static const char *HASH_NAME = "Mozilla key3.db"; static const u64 KERN_TYPE = 26000; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_NOT_ITERATED; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_BE; diff --git a/src/modules/module_26100.c b/src/modules/module_26100.c index c42958289..3414fcbdb 100644 --- a/src/modules/module_26100.c +++ b/src/modules/module_26100.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER; static const char *HASH_NAME = "Mozilla key4.db"; static const u64 KERN_TYPE = 26100; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_26600.c b/src/modules/module_26600.c index 926555e94..92b5c8efc 100644 --- a/src/modules/module_26600.c +++ b/src/modules/module_26600.c @@ -21,6 +21,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_CRYPTOCURRENCY_WALLET; static const char *HASH_NAME = "MetaMask Wallet (needs all data, checks AES-GCM tag)"; static const u64 KERN_TYPE = 26600; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_26700.c b/src/modules/module_26700.c index 63854c0d2..47427ba32 100644 --- a/src/modules/module_26700.c +++ b/src/modules/module_26700.c @@ -21,7 +21,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; static const u32 HASH_CATEGORY = HASH_CATEGORY_NETWORK_PROTOCOL; static const char *HASH_NAME = "SNMPv3 HMAC-SHA224-128"; static const u64 KERN_TYPE = 26700; -static const u32 
OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_NATIVE_THREADS | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_26800.c b/src/modules/module_26800.c index 024bfa636..44d72f7df 100644 --- a/src/modules/module_26800.c +++ b/src/modules/module_26800.c @@ -21,7 +21,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_6; static const u32 HASH_CATEGORY = HASH_CATEGORY_NETWORK_PROTOCOL; static const char *HASH_NAME = "SNMPv3 HMAC-SHA256-192"; static const u64 KERN_TYPE = 26800; -static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_NATIVE_THREADS | OPTS_TYPE_PT_GENERATE_LE; diff --git a/src/modules/module_26900.c b/src/modules/module_26900.c index 8d916ebf2..29e3cdc92 100644 --- a/src/modules/module_26900.c +++ b/src/modules/module_26900.c @@ -22,6 +22,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_NETWORK_PROTOCOL; static const char *HASH_NAME = "SNMPv3 HMAC-SHA384-256"; static const u64 KERN_TYPE = 26900; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_USES_BITS_64; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_NATIVE_THREADS diff --git a/src/modules/module_27300.c b/src/modules/module_27300.c index ff20acc9d..4d607051e 100644 --- a/src/modules/module_27300.c +++ b/src/modules/module_27300.c @@ -22,6 +22,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_NETWORK_PROTOCOL; static const char *HASH_NAME = "SNMPv3 HMAC-SHA512-384"; static const u64 KERN_TYPE = 27300; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_USES_BITS_64; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_NATIVE_THREADS diff --git a/src/modules/module_27400.c b/src/modules/module_27400.c index 7d9f0fbd4..059b50cb5 100644 --- a/src/modules/module_27400.c +++ b/src/modules/module_27400.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VMware VMX (PBKDF2-HMAC-SHA1 + AES-256-CBC)"; static const u64 KERN_TYPE = 27400; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE diff --git a/src/modules/module_27500.c b/src/modules/module_27500.c index ebabe24e5..6b79eba3a 100644 --- a/src/modules/module_27500.c +++ b/src/modules/module_27500.c @@ -22,6 +22,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VirtualBox (PBKDF2-HMAC-SHA256 & AES-128-XTS)"; static const u64 KERN_TYPE = 27500; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP | OPTI_TYPE_SLOW_HASH_SIMD_LOOP2; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE diff --git a/src/modules/module_27600.c b/src/modules/module_27600.c index b4cdade54..3259ce05f 100644 --- a/src/modules/module_27600.c +++ b/src/modules/module_27600.c @@ -22,6 +22,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VirtualBox (PBKDF2-HMAC-SHA256 & AES-256-XTS)"; static const u64 KERN_TYPE = 27600; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_SLOW_HASH_SIMD_LOOP | OPTI_TYPE_SLOW_HASH_SIMD_LOOP2; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE diff 
--git a/src/modules/module_29451.c b/src/modules/module_29451.c index c8e10bdcf..fe3c0737f 100644 --- a/src/modules/module_29451.c +++ b/src/modules/module_29451.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 512 bit"; static const u64 KERN_TYPE = 13751; // old kernel used here static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_LOOP_EXTENDED diff --git a/src/modules/module_29452.c b/src/modules/module_29452.c index 395839f64..842f36f47 100644 --- a/src/modules/module_29452.c +++ b/src/modules/module_29452.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1024 bit"; static const u64 KERN_TYPE = 13752; // old kernel used here static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_LOOP_EXTENDED diff --git a/src/modules/module_29453.c b/src/modules/module_29453.c index 97bf9a333..6162337de 100644 --- a/src/modules/module_29453.c +++ b/src/modules/module_29453.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1536 bit"; static const u64 KERN_TYPE = 13753; // old kernel used here static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_LOOP_EXTENDED diff --git a/src/modules/module_29461.c b/src/modules/module_29461.c index be7dcfc2e..ff2705da7 100644 --- a/src/modules/module_29461.c +++ b/src/modules/module_29461.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 512 bit + boot-mode"; static const u64 KERN_TYPE = 13751; // old kernel used here static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_LOOP_EXTENDED diff --git a/src/modules/module_29462.c b/src/modules/module_29462.c index 7e9a6535c..14b53d6ad 100644 --- a/src/modules/module_29462.c +++ b/src/modules/module_29462.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1024 bit + boot-mode"; static const u64 KERN_TYPE = 13752; // old kernel used here static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_LOOP_EXTENDED diff --git a/src/modules/module_29463.c b/src/modules/module_29463.c index e98c71289..7a5c8ac92 100644 --- a/src/modules/module_29463.c +++ b/src/modules/module_29463.c @@ -23,7 +23,8 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FDE; static const char *HASH_NAME = "VeraCrypt SHA256 + XTS 1536 bit + boot-mode"; static const u64 KERN_TYPE = 13753; // old kernel used here static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE - | 
OPTI_TYPE_SLOW_HASH_SIMD_LOOP; + | OPTI_TYPE_SLOW_HASH_SIMD_LOOP + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_LOOP_EXTENDED diff --git a/src/modules/module_70000.c b/src/modules/module_70000.c index 01504ea09..57bd77d12 100644 --- a/src/modules/module_70000.c +++ b/src/modules/module_70000.c @@ -29,7 +29,7 @@ static const u64 BRIDGE_TYPE = BRIDGE_TYPE_MATCH_TUNINGS // optional - impr | BRIDGE_TYPE_REPLACE_LOOP; static const char *BRIDGE_NAME = "argon2id_reference"; static const char *ST_PASS = "hashcat"; -static const char *ST_HASH = "$argon2id$v=19$m=4096,t=3,p=1$FoIjFnZlM2JSJWYXUgMFAw$eYKMzhbW8uyT1LLtKRdRcJj2CQeRrdr2pKv/Y71YbAQ"; +static const char *ST_HASH = "$argon2id$v=19$m=65536,t=3,p=1$FBMjI4RJBhIykCgol1KEJA$2ky5GAdhT1kH4kIgPN/oERE3Taiy43vNN70a3HpiKQU"; u32 module_attack_exec (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ATTACK_EXEC; } u32 module_dgst_pos0 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS0; } diff --git a/src/shared.c b/src/shared.c index a00025352..f7dbcd78d 100644 --- a/src/shared.c +++ b/src/shared.c @@ -104,6 +104,9 @@ static const char *const OPTI_STR_USES_BITS_8 = "Uses-8-Bit"; static const char *const OPTI_STR_USES_BITS_16 = "Uses-16-Bit"; static const char *const OPTI_STR_USES_BITS_32 = "Uses-32-Bit"; static const char *const OPTI_STR_USES_BITS_64 = "Uses-64-Bit"; +static const char *const OPTI_STR_SLOW_HASH_DIMY_INIT = "Slow-Hash-DimensionY-INIT"; +static const char *const OPTI_STR_SLOW_HASH_DIMY_COMP = "Slow-Hash-DimensionY-COMP"; +static const char *const OPTI_STR_SLOW_HASH_DIMY_LOOP = "Slow-Hash-DimensionY-LOOP"; static const char *const HASH_CATEGORY_UNDEFINED_STR = "Undefined"; static const char *const HASH_CATEGORY_RAW_HASH_STR = "Raw Hash"; @@ -1072,6 +1075,9 @@ const char *stroptitype (const u32 opti_type) case OPTI_TYPE_SLOW_HASH_SIMD_LOOP: return OPTI_STR_SLOW_HASH_SIMD_LOOP; case OPTI_TYPE_SLOW_HASH_SIMD_LOOP2: return OPTI_STR_SLOW_HASH_SIMD_LOOP2; case OPTI_TYPE_SLOW_HASH_SIMD_COMP: return OPTI_STR_SLOW_HASH_SIMD_COMP; + case OPTI_TYPE_SLOW_HASH_DIMY_INIT: return OPTI_STR_SLOW_HASH_DIMY_INIT; + case OPTI_TYPE_SLOW_HASH_DIMY_LOOP: return OPTI_STR_SLOW_HASH_DIMY_LOOP; + case OPTI_TYPE_SLOW_HASH_DIMY_COMP: return OPTI_STR_SLOW_HASH_DIMY_COMP; case OPTI_TYPE_USES_BITS_8: return OPTI_STR_USES_BITS_8; case OPTI_TYPE_USES_BITS_16: return OPTI_STR_USES_BITS_16; case OPTI_TYPE_USES_BITS_32: return OPTI_STR_USES_BITS_32; diff --git a/tools/test_modules/m70000.pm b/tools/test_modules/m70000.pm index 4b44334ac..72861ae49 100644 --- a/tools/test_modules/m70000.pm +++ b/tools/test_modules/m70000.pm @@ -17,7 +17,7 @@ sub module_generate_hash { my $word = shift; my $salt = shift; - my $m = shift // 4096; + my $m = shift // 65536; my $t = shift // 3; my $p = shift // 1; my $len = shift // random_number (1, 2) * 16; From 78c8180e12bcea14e227bd1b35237afaabea80b6 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 15:33:59 +0200 Subject: [PATCH 10/37] make runtime-specific functions static void and simplify parameters --- src/backend.c | 67 ++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/src/backend.c b/src/backend.c index 23baa8b25..9ffd6e2cc 100644 ---
a/src/backend.c +++ b/src/backend.c @@ -5418,7 +5418,7 @@ void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx) memset (backend_ctx, 0, sizeof (backend_ctx_t)); } -int backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device, bool *need_nvml, MAYBE_UNUSED bool *need_nvapi) +static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; user_options_t *user_options = hashcat_ctx->user_options; @@ -5767,10 +5767,10 @@ int backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, bool is_virtualiz if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) { - *need_nvml = true; + backend_ctx->need_nvml = true; #if defined (_WIN) || defined (__CYGWIN__) - *need_nvapi = true; + backend_ctx->need_nvapi = true; #endif } @@ -5891,11 +5891,9 @@ int backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, bool is_virtualiz backend_ctx->cuda_devices_cnt = cuda_devices_cnt; backend_ctx->cuda_devices_active = cuda_devices_active; - - return 0; } -int backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device, bool *need_adl, MAYBE_UNUSED bool *need_sysfs_amdgpu) +static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) { #if defined (__linux__) const folder_config_t *folder_config = hashcat_ctx->folder_config; @@ -6262,10 +6260,10 @@ int backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, bool is_virtualize if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD_USE_HIP) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP)) { - *need_adl = true; + backend_ctx->need_adl = true; #if defined (__linux__) - *need_sysfs_amdgpu = true; + backend_ctx->need_sysfs_amdgpu = true; #endif } @@ -6400,11 +6398,9 @@ int backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, bool is_virtualize backend_ctx->hip_devices_cnt = hip_devices_cnt; backend_ctx->hip_devices_active = hip_devices_active; - - return 0; } -int backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED bool is_virtualized, MAYBE_UNUSED int virtmulti, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device, MAYBE_UNUSED bool *need_iokit) +static void backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED bool is_virtualized, MAYBE_UNUSED int virtmulti, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device) { backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -6783,7 +6779,7 @@ int backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED boo if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_APPLE)) { - *need_iokit = true; + backend_ctx->need_iokit = true; } // CPU burning loop damper @@ -6835,11 +6831,9 @@ int backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED boo 
backend_ctx->metal_devices_cnt = metal_devices_cnt; backend_ctx->metal_devices_active = metal_devices_active; - - return 0; } -int backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device, bool *need_nvml, MAYBE_UNUSED bool *need_nvapi, bool *need_adl, MAYBE_UNUSED bool *need_sysfs_amdgpu, MAYBE_UNUSED bool *need_sysfs_cpu, MAYBE_UNUSED bool *need_iokit) +static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) { const folder_config_t *folder_config = hashcat_ctx->folder_config; backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; @@ -7691,12 +7685,12 @@ int backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtual #if defined (__APPLE__) if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) { - *need_iokit = true; + backend_ctx->need_iokit = true; } #endif #if defined (__linux__) - *need_sysfs_cpu = true; + backend_ctx->need_sysfs_cpu = true; #endif } @@ -7704,19 +7698,19 @@ int backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtual { if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) { - *need_adl = true; + backend_ctx->need_adl = true; #if defined (__linux__) - *need_sysfs_amdgpu = true; + backend_ctx->need_sysfs_amdgpu = true; #endif } if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV)) { - *need_nvml = true; + backend_ctx->need_nvml = true; #if defined (_WIN) || defined (__CYGWIN__) - *need_nvapi = true; + backend_ctx->need_nvapi = true; #endif } @@ -7725,7 +7719,7 @@ int backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtual { if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) { - *need_iokit = true; + backend_ctx->need_iokit = true; } } #endif @@ -8244,8 +8238,6 @@ int backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtual backend_ctx->opencl_devices_cnt = opencl_devices_cnt; backend_ctx->opencl_devices_active = opencl_devices_active; - - return 0; } int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) @@ -8258,12 +8250,12 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) hc_device_param_t *devices_param = backend_ctx->devices_param; - bool need_adl = false; - bool need_nvml = false; - bool need_nvapi = false; - bool need_sysfs_amdgpu = false; - bool need_sysfs_cpu = false; - bool need_iokit = false; + backend_ctx->need_adl = false; + backend_ctx->need_nvml = false; + backend_ctx->need_nvapi = false; + backend_ctx->need_sysfs_amdgpu = false; + backend_ctx->need_sysfs_cpu = false; + backend_ctx->need_iokit = false; int bridge_link_device = 0; // this will only count active device @@ -8278,19 +8270,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // CUDA - backend_ctx_devices_init_cuda (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device, &need_nvml, &need_nvapi); + backend_ctx_devices_init_cuda (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // HIP - backend_ctx_devices_init_hip (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, 
&backend_devices_idx, &bridge_link_device, &need_adl, &need_sysfs_amdgpu); + backend_ctx_devices_init_hip (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // Metal - backend_ctx_devices_init_metal (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device, &need_iokit); + backend_ctx_devices_init_metal (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // OCL - backend_ctx_devices_init_opencl (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device, &need_nvml, &need_nvapi, &need_adl, &need_sysfs_amdgpu, &need_sysfs_cpu, &need_iokit); + backend_ctx_devices_init_opencl (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // all devices combined go into backend_* variables @@ -8908,13 +8900,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) backend_ctx->target_msec = TARGET_MSEC_PROFILE[user_options->workload_profile - 1]; - backend_ctx->need_adl = need_adl; - backend_ctx->need_nvml = need_nvml; - backend_ctx->need_nvapi = need_nvapi; - backend_ctx->need_sysfs_amdgpu = need_sysfs_amdgpu; - backend_ctx->need_sysfs_cpu = need_sysfs_cpu; - backend_ctx->need_iokit = need_iokit; - backend_ctx->comptime = comptime; return 0; From ca7111996863cbc0552c3a9e016cecfa20480f8f Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 16:59:39 +0200 Subject: [PATCH 11/37] Selftest: rename selftest function to process_selftest and splitting into 3 smaller functions --- docs/changes.txt | 1 + src/selftest.c | 72 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index 7e15685c3..f4dc61531 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -141,6 +141,7 @@ - Modules: Updated module_unstable_warning - Open Document Format: Added support for small documents with content length < 1024 - OpenCL Backend: added workaround to set device_available_memory from CUDA/HIP alias device +- Selftest: rename selftest function to process_selftest and splitting into 3 smaller functions - Status Code: Add specific return code for self-test fail (-11) - Scrypt: Increase buffer sizes in module for hash mode 8900 to allow longer scrypt digests - Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629 diff --git a/src/selftest.c b/src/selftest.c index f1b2b2dd5..aea6f3128 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -12,18 +12,14 @@ #include "thread.h" #include "selftest.h" -static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) +static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u32 *highest_pw_len) { - bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; - hashconfig_t *hashconfig = hashcat_ctx->hashconfig; hashes_t *hashes = hashcat_ctx->hashes; module_ctx_t *module_ctx = hashcat_ctx->module_ctx; - status_ctx_t *status_ctx = hashcat_ctx->status_ctx; + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; user_options_t *user_options = hashcat_ctx->user_options; user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; - if (hashconfig->st_hash == NULL) return 0; - // init : replace hashes with selftest hash if (device_param->is_cuda == true) @@ -86,8 +82,6 @@ static int selftest (hashcat_ctx_t 
*hashcat_ctx, hc_device_param_t *device_param pw_t comb; bf_t bf; - u32 highest_pw_len = 0; - if (user_options->slow_candidates == true) { if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) @@ -460,7 +454,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1; } - highest_pw_len = pw.pw_len; + *highest_pw_len = pw.pw_len; } } } @@ -500,6 +494,16 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } } + return 0; +} + +static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u32 highest_pw_len) +{ + bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + hashes_t *hashes = hashcat_ctx->hashes; + module_ctx_t *module_ctx = hashcat_ctx->module_ctx; + // main : run the kernel const u32 kernel_threads_sav = device_param->kernel_threads; @@ -933,22 +937,28 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param device_param->kernel_threads = kernel_threads_sav; - // check : check if cracked + return 0; +} - u32 num_cracked = 0; +static int selftest_cleanup (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u32 *num_cracked) +{ + user_options_t *user_options = hashcat_ctx->user_options; + user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra; + + // check : check if cracked cl_event opencl_event; if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoHAsync (hashcat_ctx, num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1; if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event3, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoHAsync (hashcat_ctx, num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1; if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event3, device_param->hip_stream) == -1) return -1; } @@ -956,13 +966,13 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param #if defined (__APPLE__) if (device_param->is_metal == true) { - if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, &num_cracked, device_param->metal_d_result, 0, sizeof (u32)) == -1) return -1; + if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, num_cracked, device_param->metal_d_result, 0, sizeof (u32)) == -1) return -1; } #endif if (device_param->is_opencl == true) { - if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_FALSE, 0, sizeof (u32), &num_cracked, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_FALSE, 0, sizeof (u32), num_cracked, 0, NULL, &opencl_event) == -1) return -1; if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; } @@ -1154,7 +1164,27 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if 
(hc_clReleaseEvent (hashcat_ctx, opencl_event) == -1) return -1; } + return 0; +} + +static int process_selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) +{ + hashconfig_t *hashconfig = hashcat_ctx->hashconfig; + status_ctx_t *status_ctx = hashcat_ctx->status_ctx; + + if (hashconfig->st_hash == NULL) return 0; + + u32 highest_pw_len = 0; + u32 num_cracked = 0; + + if (selftest_init (hashcat_ctx, device_param, &highest_pw_len) == -1) return -1; + + if (selftest_run_kernel (hashcat_ctx, device_param, highest_pw_len) == -1) return -1; + + if (selftest_cleanup (hashcat_ctx, device_param, &num_cracked) == -1) return -1; + // check return + if (num_cracked == 0) { hc_thread_mutex_lock (status_ctx->mux_display); @@ -1169,20 +1199,22 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param event_log_error (hashcat_ctx, "* Device #%u: ATTENTION! HIP kernel self-test failed.", device_param->device_id + 1); } - #if defined (__APPLE__) if (device_param->is_metal == true) { event_log_error (hashcat_ctx, "* Device #%u: ATTENTION! Metal kernel self-test failed.", device_param->device_id + 1); } - #endif if (device_param->is_opencl == true) { event_log_error (hashcat_ctx, "* Device #%u: ATTENTION! OpenCL kernel self-test failed.", device_param->device_id + 1); } - event_log_warning (hashcat_ctx, "Your device driver installation is probably broken."); - event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver"); + if (device_param->is_metal == false) + { + event_log_warning (hashcat_ctx, "Your device driver installation is probably broken."); + event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver"); + } + event_log_warning (hashcat_ctx, NULL); hc_thread_mutex_unlock (status_ctx->mux_display); @@ -1232,7 +1264,7 @@ HC_API_CALL void *thread_selftest (void *p) if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; } - const int rc_selftest = selftest (hashcat_ctx, device_param); + const int rc_selftest = process_selftest (hashcat_ctx, device_param); if (user_options->benchmark == true) { From 907e58c27dd5c9c4474d97afd7430d24fb860f18 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 18:26:17 +0200 Subject: [PATCH 12/37] move is_virtualized and virtmulti into runtime-specific functions to simplify parameters --- src/backend.c | 66 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/src/backend.c b/src/backend.c index 9ffd6e2cc..f65b2959e 100644 --- a/src/backend.c +++ b/src/backend.c @@ -5418,12 +5418,17 @@ void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx) memset (backend_ctx, 0, sizeof (backend_ctx_t)); } -static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) +static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) { - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; + const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; - hc_device_param_t *devices_param = backend_ctx->devices_param; + hc_device_param_t *devices_param = backend_ctx->devices_param; + + bool is_virtualized = 
((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false; + + int virtmulti = (bridge_ctx->enabled == true) ? bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti; int cuda_devices_cnt = 0; int cuda_devices_active = 0; @@ -5893,16 +5898,21 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, bool is_v backend_ctx->cuda_devices_active = cuda_devices_active; } -static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) +static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) { #if defined (__linux__) const folder_config_t *folder_config = hashcat_ctx->folder_config; #endif - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; + const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; hc_device_param_t *devices_param = backend_ctx->devices_param; + bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false; + + int virtmulti = (bridge_ctx->enabled == true) ? bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti; + int hip_devices_cnt = 0; int hip_devices_active = 0; @@ -6400,15 +6410,21 @@ static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, bool is_vi backend_ctx->hip_devices_active = hip_devices_active; } -static void backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED bool is_virtualized, MAYBE_UNUSED int virtmulti, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device) +static void backend_ctx_devices_init_metal (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device) { - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - int metal_devices_cnt = 0; int metal_devices_active = 0; #if defined (__APPLE__) - hc_device_param_t *devices_param = backend_ctx->devices_param; + const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; + + hc_device_param_t *devices_param = backend_ctx->devices_param; + + bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false; + + int virtmulti = (bridge_ctx->enabled == true) ? 
bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti; if (backend_ctx->mtl) { @@ -6833,14 +6849,19 @@ static void backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UN backend_ctx->metal_devices_active = metal_devices_active; } -static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is_virtualized, int virtmulti, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) +static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device) { const folder_config_t *folder_config = hashcat_ctx->folder_config; - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; + const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; hc_device_param_t *devices_param = backend_ctx->devices_param; + bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false; + + int virtmulti = (bridge_ctx->enabled == true) ? bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti; + int opencl_devices_cnt = 0; int opencl_devices_active = 0; @@ -8242,12 +8263,11 @@ static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, bool is int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) { - const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; if (backend_ctx->enabled == false) return 0; + user_options_t *user_options = hashcat_ctx->user_options; hc_device_param_t *devices_param = backend_ctx->devices_param; backend_ctx->need_adl = false; @@ -8261,28 +8281,24 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) int backend_devices_idx = 0; // this will not only count active devices - bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false; - - int virtmulti = (bridge_ctx->enabled == true) ? 
bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti; - int virthost = -1; int virthost_finder = user_options->backend_devices_virthost; // CUDA - backend_ctx_devices_init_cuda (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); + backend_ctx_devices_init_cuda (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // HIP - backend_ctx_devices_init_hip (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); + backend_ctx_devices_init_hip (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // Metal - backend_ctx_devices_init_metal (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); + backend_ctx_devices_init_metal (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // OCL - backend_ctx_devices_init_opencl (hashcat_ctx, is_virtualized, virtmulti, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); + backend_ctx_devices_init_opencl (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device); // all devices combined go into backend_* variables From 0830bc4b9ffe123696f80490c84bfef646676e2a Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 18:29:39 +0200 Subject: [PATCH 13/37] set backend_ctx before macro preprocessor, into backend_ctx_devices_init_metal --- src/backend.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend.c b/src/backend.c index f65b2959e..1820359e1 100644 --- a/src/backend.c +++ b/src/backend.c @@ -6410,14 +6410,15 @@ static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virth backend_ctx->hip_devices_active = hip_devices_active; } -static void backend_ctx_devices_init_metal (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device) +static void backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device) { + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + int metal_devices_cnt = 0; int metal_devices_active = 0; #if defined (__APPLE__) const bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; user_options_t *user_options = hashcat_ctx->user_options; hc_device_param_t *devices_param = backend_ctx->devices_param; From 161f00b9debd74977a0fa09eff8e035118eea3f2 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 19:07:33 +0200 Subject: [PATCH 14/37] Documents: Renamed status_code.txt in exit_status_code.txt and added device_status_code.txt --- docs/changes.txt | 1 + docs/device_status_code.txt | 20 +++++++++++++++++++ ...{status_codes.txt => exit_status_code.txt} | 0 3 files changed, 21 insertions(+) create mode 100644 docs/device_status_code.txt rename docs/{status_codes.txt => exit_status_code.txt} (100%) diff --git a/docs/changes.txt b/docs/changes.txt index f4dc61531..b36c1e3ff 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -136,6 +136,7 @@ - Dependencies: Updated xxHash to 0.8.3 (commit 50f4226) - Building: Support building windows binaries on macOS using MinGW - Dependencies: 
Updated OpenCL-Headers to v2024.10.24 (commit 265df85) +- Documents: Renamed status_code.txt to exit_status_code.txt and added device_status_code.txt - Documents: Updated BUILD.md and added BUILD_macOS.md (containing instructions for building windows binaries on macOS) - Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py` - Modules: Updated module_unstable_warning diff --git a/docs/device_status_code.txt b/docs/device_status_code.txt new file mode 100644 index 000000000..bd1607409 --- /dev/null +++ b/docs/device_status_code.txt @@ -0,0 +1,20 @@ +Device Status Codes: +==================== + + 0 = "Initializing" + 1 = "Autotuning" + 2 = "Selftest" + 3 = "Running" + 4 = "Paused" + 5 = "Exhausted" + 6 = "Cracked" + 7 = "Aborted" + 8 = "Quit" + 9 = "Bypass" +10 = "Aborted (Checkpoint)" +11 = "Aborted (Runtime)" +12 = "Running (Checkpoint Quit requested)" +13 = "Error" +14 = "Aborted (Finish)" +15 = "Running (Quit after attack requested)" +16 = "Autodetect" diff --git a/docs/status_codes.txt b/docs/exit_status_code.txt similarity index 100% rename from docs/status_codes.txt rename to docs/exit_status_code.txt From 1978231c88a095dd1cf22d37fcdc74ee2aa43249 Mon Sep 11 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 22:51:08 +0200 Subject: [PATCH 15/37] - Fixed bug in Hardware Monitor: prevent disable if ADL fails - Hardware Monitor: Split hwmon_ctx_init function into smaller library-specific functions --- docs/changes.txt | 2 + src/hwmon.c | 481 +++++++++++++++++++++++++---------------------- 2 files changed, 259 insertions(+), 224 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index f4dc61531..4039eae51 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -92,6 +92,7 @@ - Fixed bug in grep out-of-memory workaround on Unit Test - Fixed bug in input_tokenizer when TOKEN_ATTR_FIXED_LENGTH is used and refactor modules - Fixed bug in --stdout that caused certain rules to malfunction +- Fixed bug in Hardware Monitor: prevent disable if ADL fails - Fixed build failed for 10700 optimized with Apple Metal - Fixed build failed for 13772 and 13773 with Apple Metal - Fixed build failed for 18400 with Apple Metal @@ -155,6 +156,7 @@ - Modules: Check UnpackSize to raise false positive with hc_decompress_rar - User Options: added --metal-compiler-runtime option - Hardware Monitor: avoid sprintf in src/ext_iokit.c +- Hardware Monitor: Split hwmon_ctx_init function into smaller library-specific functions - Help: show supported hash-modes only with -hh - Makefile: prevent make failure with Apple Silicon in case of partial rebuild - Rules: Rename best64.rule to best66.rule and remove the unknown section from it diff --git a/src/hwmon.c b/src/hwmon.c index 4f5264b3d..e32b261fd 100644 --- a/src/hwmon.c +++ b/src/hwmon.c @@ -1268,142 +1268,10 @@ u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int ba return 0; } -int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) +static void hwmon_ctx_init_nvml (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_nvml, int backend_devices_cnt) { - bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; - hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; - backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; - user_options_t *user_options = hashcat_ctx->user_options; - - hwmon_ctx->enabled = false; - - int backend_devices_cnt = backend_ctx->backend_devices_cnt; - - if (bridge_ctx->enabled == true) backend_devices_cnt = 1; - - //#if !defined (WITH_HWMON) - //return 0; - //#endif // 
WITH_HWMON - - if (user_options->usage > 0) return 0; - //if (user_options->backend_info > 0) return 0; - - if (user_options->hash_info == true) return 0; - if (user_options->keyspace == true) return 0; - if (user_options->left == true) return 0; - if (user_options->show == true) return 0; - if (user_options->stdout_flag == true) return 0; - if (user_options->version == true) return 0; - if (user_options->identify == true) return 0; - //we need hwmon support to get free memory per device support - //its a joke, but there's no way around - //if (user_options->hwmon == false) return 0; - - hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); - - /** - * Initialize shared libraries - */ - - hm_attrs_t *hm_adapters_adl = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); - hm_attrs_t *hm_adapters_nvapi = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); - hm_attrs_t *hm_adapters_nvml = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); - hm_attrs_t *hm_adapters_sysfs_amdgpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); - hm_attrs_t *hm_adapters_sysfs_cpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); - hm_attrs_t *hm_adapters_iokit = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); - - #define FREE_ADAPTERS \ - do { \ - hcfree (hm_adapters_adl); \ - hcfree (hm_adapters_nvapi); \ - hcfree (hm_adapters_nvml); \ - hcfree (hm_adapters_sysfs_amdgpu); \ - hcfree (hm_adapters_sysfs_cpu); \ - hcfree (hm_adapters_iokit); \ - } while (0) - - if (backend_ctx->need_nvml == true) - { - hwmon_ctx->hm_nvml = (NVML_PTR *) hcmalloc (sizeof (NVML_PTR)); - - if (nvml_init (hashcat_ctx) == -1) - { - hcfree (hwmon_ctx->hm_nvml); - - hwmon_ctx->hm_nvml = NULL; - } - } - - if ((backend_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too - { - hwmon_ctx->hm_nvapi = (NVAPI_PTR *) hcmalloc (sizeof (NVAPI_PTR)); - - if (nvapi_init (hashcat_ctx) == -1) - { - hcfree (hwmon_ctx->hm_nvapi); - - hwmon_ctx->hm_nvapi = NULL; - } - } - - if (backend_ctx->need_adl == true) - { - hwmon_ctx->hm_adl = (ADL_PTR *) hcmalloc (sizeof (ADL_PTR)); - - if (adl_init (hashcat_ctx) == -1) - { - hcfree (hwmon_ctx->hm_adl); - - hwmon_ctx->hm_adl = NULL; - } - } - - if (backend_ctx->need_sysfs_amdgpu == true) - { - hwmon_ctx->hm_sysfs_amdgpu = (SYSFS_AMDGPU_PTR *) hcmalloc (sizeof (SYSFS_AMDGPU_PTR)); - - if (sysfs_amdgpu_init (hashcat_ctx) == false) - { - hcfree (hwmon_ctx->hm_sysfs_amdgpu); - - hwmon_ctx->hm_sysfs_amdgpu = NULL; - } - - // also if there's ADL, we don't need sysfs_amdgpu - - if (hwmon_ctx->hm_adl) - { - hcfree (hwmon_ctx->hm_sysfs_amdgpu); - - hwmon_ctx->hm_sysfs_amdgpu = NULL; - } - } - - if (backend_ctx->need_sysfs_cpu == true) - { - hwmon_ctx->hm_sysfs_cpu = (SYSFS_CPU_PTR *) hcmalloc (sizeof (SYSFS_CPU_PTR)); - - if (sysfs_cpu_init (hashcat_ctx) == false) - { - hcfree (hwmon_ctx->hm_sysfs_cpu); - - hwmon_ctx->hm_sysfs_cpu = NULL; - } - } - - #if defined(__APPLE__) - if (backend_ctx->need_iokit == true) - { - hwmon_ctx->hm_iokit = (IOKIT_PTR *) hcmalloc (sizeof (IOKIT_PTR)); - - if (iokit_init (hashcat_ctx) == false) - { - hcfree (hwmon_ctx->hm_iokit); - - hwmon_ctx->hm_iokit = NULL; - } - } - #endif + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; if (hwmon_ctx->hm_nvml) { @@ -1485,6 +1353,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hcfree (nvmlGPUHandle); } } +} + +static void hwmon_ctx_init_nvapi (hashcat_ctx_t 
*hashcat_ctx, hm_attrs_t *hm_adapters_nvapi, int backend_devices_cnt) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; if (hwmon_ctx->hm_nvapi) { @@ -1558,6 +1432,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hcfree (nvGPUHandle); } } +} + +static int hwmon_ctx_init_adl (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_adl, int backend_devices_cnt) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; if (hwmon_ctx->hm_adl) { @@ -1567,23 +1447,13 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) int tmp_in; - if (get_adapters_num_adl (hashcat_ctx, &tmp_in) == -1) - { - FREE_ADAPTERS; - - return -1; - } + if (get_adapters_num_adl (hashcat_ctx, &tmp_in) == -1) return -1; // adapter info LPAdapterInfo lpAdapterInfo = (LPAdapterInfo) hccalloc (tmp_in, sizeof (AdapterInfo)); - if (hm_ADL_Adapter_AdapterInfo_Get (hashcat_ctx, lpAdapterInfo, tmp_in * sizeof (AdapterInfo)) == -1) - { - FREE_ADAPTERS; - - return -1; - } + if (hm_ADL_Adapter_AdapterInfo_Get (hashcat_ctx, lpAdapterInfo, tmp_in * sizeof (AdapterInfo)) == -1) return -1; for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++) { @@ -1639,107 +1509,260 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) } } + return 0; +} + +static void hwmon_ctx_init_sysfs_amdgpu_iokit (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_sysfs_amdgpu, hm_attrs_t *hm_adapters_iokit, int backend_devices_cnt) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; + if (hwmon_ctx->hm_sysfs_amdgpu || hwmon_ctx->hm_iokit) { - if (true) + for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++) { - for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + + if (device_param->skipped == true) continue; + + if (device_param->is_cuda == true) { - hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + // nothing to do + } - if (device_param->skipped == true) continue; + #if defined (__APPLE__) + if (device_param->is_metal == true) + { + const u32 device_id = device_param->device_id; - if (device_param->is_cuda == true) + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit)) { - // nothing to do + hm_adapters_iokit[device_id].buslanes_get_supported = false; + hm_adapters_iokit[device_id].corespeed_get_supported = false; + hm_adapters_iokit[device_id].fanspeed_get_supported = true; + hm_adapters_iokit[device_id].fanpolicy_get_supported = false; + hm_adapters_iokit[device_id].memoryspeed_get_supported = false; + hm_adapters_iokit[device_id].temperature_get_supported = true; + hm_adapters_iokit[device_id].utilization_get_supported = true; + } + } + #endif + + if ((device_param->is_opencl == true) || (device_param->is_hip == true)) + { + const u32 device_id = device_param->device_id; + + if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit)) + { + hm_adapters_iokit[device_id].buslanes_get_supported = false; + hm_adapters_iokit[device_id].corespeed_get_supported = false; + hm_adapters_iokit[device_id].fanspeed_get_supported = true; + hm_adapters_iokit[device_id].fanpolicy_get_supported = false; + hm_adapters_iokit[device_id].memoryspeed_get_supported = false; + 
hm_adapters_iokit[device_id].temperature_get_supported = true; + hm_adapters_iokit[device_id].utilization_get_supported = true; } - #if defined (__APPLE__) - if (device_param->is_metal == true) + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue; + + if (hwmon_ctx->hm_sysfs_amdgpu) { - const u32 device_id = device_param->device_id; - - if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit)) - { - hm_adapters_iokit[device_id].buslanes_get_supported = false; - hm_adapters_iokit[device_id].corespeed_get_supported = false; - hm_adapters_iokit[device_id].fanspeed_get_supported = true; - hm_adapters_iokit[device_id].fanpolicy_get_supported = false; - hm_adapters_iokit[device_id].memoryspeed_get_supported = false; - hm_adapters_iokit[device_id].temperature_get_supported = true; - hm_adapters_iokit[device_id].utilization_get_supported = true; - } - } - #endif - - if ((device_param->is_opencl == true) || (device_param->is_hip == true)) - { - const u32 device_id = device_param->device_id; - - if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit)) - { - hm_adapters_iokit[device_id].buslanes_get_supported = false; - hm_adapters_iokit[device_id].corespeed_get_supported = false; - hm_adapters_iokit[device_id].fanspeed_get_supported = true; - hm_adapters_iokit[device_id].fanpolicy_get_supported = false; - hm_adapters_iokit[device_id].memoryspeed_get_supported = false; - hm_adapters_iokit[device_id].temperature_get_supported = true; - hm_adapters_iokit[device_id].utilization_get_supported = true; - } - - if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue; - - if (hwmon_ctx->hm_sysfs_amdgpu) - { - hm_adapters_sysfs_amdgpu[device_id].buslanes_get_supported = true; - hm_adapters_sysfs_amdgpu[device_id].corespeed_get_supported = true; - hm_adapters_sysfs_amdgpu[device_id].fanspeed_get_supported = true; - hm_adapters_sysfs_amdgpu[device_id].fanpolicy_get_supported = true; - hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true; - hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true; - hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true; - hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported = true; - } + hm_adapters_sysfs_amdgpu[device_id].buslanes_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].corespeed_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].fanspeed_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].fanpolicy_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true; + hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported = true; } } } } +} + +static void hwmon_ctx_init_sysfs_cpu (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_sysfs_cpu, int backend_devices_cnt) +{ + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; if (hwmon_ctx->hm_sysfs_cpu) { - if (true) + for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++) { - for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++) + hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; + + if (device_param->skipped == true) continue; + + if (device_param->is_cuda == true) { - 
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; - - if (device_param->skipped == true) continue; - - if (device_param->is_cuda == true) - { // nothing to do - } + } - if ((device_param->is_opencl == true) || (device_param->is_hip == true)) + if ((device_param->is_opencl == true) || (device_param->is_hip == true)) + { + const u32 device_id = device_param->device_id; + + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) == 0) continue; + + if (hwmon_ctx->hm_sysfs_cpu) { - const u32 device_id = device_param->device_id; - - if ((device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) == 0) continue; - - if (hwmon_ctx->hm_sysfs_cpu) - { - hm_adapters_sysfs_cpu[device_id].buslanes_get_supported = false; - hm_adapters_sysfs_cpu[device_id].corespeed_get_supported = false; - hm_adapters_sysfs_cpu[device_id].fanspeed_get_supported = false; - hm_adapters_sysfs_cpu[device_id].fanpolicy_get_supported = false; - hm_adapters_sysfs_cpu[device_id].memoryspeed_get_supported = false; - hm_adapters_sysfs_cpu[device_id].temperature_get_supported = true; - hm_adapters_sysfs_cpu[device_id].utilization_get_supported = true; - } + hm_adapters_sysfs_cpu[device_id].buslanes_get_supported = false; + hm_adapters_sysfs_cpu[device_id].corespeed_get_supported = false; + hm_adapters_sysfs_cpu[device_id].fanspeed_get_supported = false; + hm_adapters_sysfs_cpu[device_id].fanpolicy_get_supported = false; + hm_adapters_sysfs_cpu[device_id].memoryspeed_get_supported = false; + hm_adapters_sysfs_cpu[device_id].temperature_get_supported = true; + hm_adapters_sysfs_cpu[device_id].utilization_get_supported = true; } } } } +} + +int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) +{ + bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx; + hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx; + backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx; + user_options_t *user_options = hashcat_ctx->user_options; + + hwmon_ctx->enabled = false; + + int backend_devices_cnt = backend_ctx->backend_devices_cnt; + + if (bridge_ctx->enabled == true) backend_devices_cnt = 1; + + //#if !defined (WITH_HWMON) + //return 0; + //#endif // WITH_HWMON + + if (user_options->usage > 0) return 0; + //if (user_options->backend_info > 0) return 0; + + if (user_options->hash_info == true) return 0; + if (user_options->keyspace == true) return 0; + if (user_options->left == true) return 0; + if (user_options->show == true) return 0; + if (user_options->stdout_flag == true) return 0; + if (user_options->version == true) return 0; + if (user_options->identify == true) return 0; + //we need hwmon support to get free memory per device support + //its a joke, but there's no way around + //if (user_options->hwmon == false) return 0; + + hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); + + /** + * Initialize shared libraries + */ + + hm_attrs_t *hm_adapters_adl = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); + hm_attrs_t *hm_adapters_nvapi = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); + hm_attrs_t *hm_adapters_nvml = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); + hm_attrs_t *hm_adapters_sysfs_amdgpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); + hm_attrs_t *hm_adapters_sysfs_cpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); + hm_attrs_t *hm_adapters_iokit = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t)); + + if (backend_ctx->need_nvml == true) + { + hwmon_ctx->hm_nvml = (NVML_PTR *) hcmalloc (sizeof 
(NVML_PTR)); + + if (nvml_init (hashcat_ctx) == -1) + { + hcfree (hwmon_ctx->hm_nvml); + + hwmon_ctx->hm_nvml = NULL; + } + } + + if ((backend_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too + { + hwmon_ctx->hm_nvapi = (NVAPI_PTR *) hcmalloc (sizeof (NVAPI_PTR)); + + if (nvapi_init (hashcat_ctx) == -1) + { + hcfree (hwmon_ctx->hm_nvapi); + + hwmon_ctx->hm_nvapi = NULL; + } + } + + if (backend_ctx->need_adl == true) + { + hwmon_ctx->hm_adl = (ADL_PTR *) hcmalloc (sizeof (ADL_PTR)); + + if (adl_init (hashcat_ctx) == -1) + { + hcfree (hwmon_ctx->hm_adl); + + hwmon_ctx->hm_adl = NULL; + } + } + + if (backend_ctx->need_sysfs_amdgpu == true) + { + hwmon_ctx->hm_sysfs_amdgpu = (SYSFS_AMDGPU_PTR *) hcmalloc (sizeof (SYSFS_AMDGPU_PTR)); + + if (sysfs_amdgpu_init (hashcat_ctx) == false) + { + hcfree (hwmon_ctx->hm_sysfs_amdgpu); + + hwmon_ctx->hm_sysfs_amdgpu = NULL; + } + } + + if (backend_ctx->need_sysfs_cpu == true) + { + hwmon_ctx->hm_sysfs_cpu = (SYSFS_CPU_PTR *) hcmalloc (sizeof (SYSFS_CPU_PTR)); + + if (sysfs_cpu_init (hashcat_ctx) == false) + { + hcfree (hwmon_ctx->hm_sysfs_cpu); + + hwmon_ctx->hm_sysfs_cpu = NULL; + } + } + + #if defined(__APPLE__) + if (backend_ctx->need_iokit == true) + { + hwmon_ctx->hm_iokit = (IOKIT_PTR *) hcmalloc (sizeof (IOKIT_PTR)); + + if (iokit_init (hashcat_ctx) == false) + { + hcfree (hwmon_ctx->hm_iokit); + + hwmon_ctx->hm_iokit = NULL; + } + } + #endif + + hwmon_ctx_init_nvml (hashcat_ctx, hm_adapters_nvml, backend_devices_cnt); + + hwmon_ctx_init_nvapi (hashcat_ctx, hm_adapters_nvapi, backend_devices_cnt); + + // if ADL init fail, disable + + if (hwmon_ctx_init_adl (hashcat_ctx, hm_adapters_adl, backend_devices_cnt) == -1) + { + hcfree (hwmon_ctx->hm_adl); + + hwmon_ctx->hm_adl = NULL; + } + + // if there's ADL, we don't need sysfs_amdgpu + + if (hwmon_ctx->hm_adl) + { + hcfree (hwmon_ctx->hm_sysfs_amdgpu); + + hwmon_ctx->hm_sysfs_amdgpu = NULL; + } + + hwmon_ctx_init_sysfs_amdgpu_iokit (hashcat_ctx, hm_adapters_sysfs_amdgpu, hm_adapters_iokit, backend_devices_cnt); + + hwmon_ctx_init_sysfs_cpu (hashcat_ctx, hm_adapters_sysfs_cpu, backend_devices_cnt); #if defined(__APPLE__) if (backend_ctx->need_iokit == true) @@ -1757,7 +1780,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) if (hwmon_ctx->hm_adl == NULL && hwmon_ctx->hm_nvml == NULL && hwmon_ctx->hm_sysfs_amdgpu == NULL && hwmon_ctx->hm_sysfs_cpu == NULL && hwmon_ctx->hm_iokit == NULL) { - FREE_ADAPTERS; + hcfree (hm_adapters_adl); + hcfree (hm_adapters_nvapi); + hcfree (hm_adapters_nvml); + hcfree (hm_adapters_sysfs_amdgpu); + hcfree (hm_adapters_sysfs_cpu); + hcfree (hm_adapters_iokit); return 0; } @@ -1992,7 +2020,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx) hm_get_memoryused_with_devices_idx (hashcat_ctx, backend_devices_idx); } - FREE_ADAPTERS; + hcfree (hm_adapters_adl); + hcfree (hm_adapters_nvapi); + hcfree (hm_adapters_nvml); + hcfree (hm_adapters_sysfs_amdgpu); + hcfree (hm_adapters_sysfs_cpu); + hcfree (hm_adapters_iokit); return 0; } From f848163b0ac594f43695a716167d75a798d7eddc Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 29 Jun 2025 23:08:43 +0200 Subject: [PATCH 16/37] Fixed race condition in selftest_init on OpenCL with non-blocking write --- docs/changes.txt | 1 + src/selftest.c | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index f4dc61531..2d8b5007a 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -92,6 +92,7 @@ - Fixed 
bug in grep out-of-memory workaround on Unit Test - Fixed bug in input_tokenizer when TOKEN_ATTR_FIXED_LENGTH is used and refactor modules - Fixed bug in --stdout that caused certain rules to malfunction +- Fixed race condition in selftest_init on OpenCL with non-blocking write - Fixed build failed for 10700 optimized with Apple Metal - Fixed build failed for 13772 and 13773 with Apple Metal - Fixed build failed for 18400 with Apple Metal diff --git a/src/selftest.c b/src/selftest.c index aea6f3128..d8b105d2e 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -78,6 +78,8 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ tmp.pw_len = (u32) tmp_len; } + cl_event opencl_event; + pw_t pw; pw_t comb; bf_t bf; @@ -118,7 +120,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else @@ -163,7 +165,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) @@ -239,9 +241,9 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, &opencl_event) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else if (user_options_extra->attack_kern == ATTACK_KERN_BF) @@ -284,7 +286,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else @@ -345,7 +347,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, 
device_param->opencl_d_bfs_c, CL_FALSE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs_c, CL_FALSE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, &opencl_event) == -1) return -1; } memset (&pw, 0, sizeof (pw)); @@ -451,7 +453,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } *highest_pw_len = pw.pw_len; @@ -489,11 +491,20 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } } + // prevent race condition on OpenCL with non-blocking write + + if (device_param->is_opencl == true) + { + if (hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event) == -1) return -1; + + if (hc_clReleaseEvent (hashcat_ctx, opencl_event) == -1) return -1; + } + return 0; } From f8df94f4571d557e50ca3a25e5e62111df18dcf4 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 30 Jun 2025 11:26:05 +0200 Subject: [PATCH 17/37] Switched all async and non-blocking calls to synchronous and blocking ones. Kept the original async bindings intact. This avoids race conditions like the one fixed in the previous commit, with no performance impact. Fixed a typedef issue for clEnqueueReadBuffer(). Updated Python/hcshared.py with missing entry for new salt_dimy attribute in salt_t struct. Fixed a bug in the autotuner when determining the starting value for kernel loops, in cases where the iteration count is N-1 and not a multiple of 1024. Updated additional plugins to use OPTI_TYPE_REGISTER_LIMIT. 
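Note on the race fixed in the previous commit (PATCH 16/37): it follows from a standard OpenCL rule. A write enqueued with CL_FALSE (non-blocking) only schedules the transfer, so the host buffer must stay valid and unmodified until the copy has actually completed. selftest_init enqueues writes from stack-local pw/comb/bf structs and then returns, which is exactly the pattern that can race; the patch therefore attaches a cl_event to the enqueue and waits on it before returning. A minimal sketch of that pattern using the raw OpenCL API (hashcat itself goes through its hc_cl* wrappers; queue, d_buf and the host value pw are illustrative placeholders, not names from the patch):

  cl_event ev;

  // CL_FALSE: the call returns immediately; &pw must not go out of scope
  // or be modified until the transfer has finished.
  if (clEnqueueWriteBuffer (queue, d_buf, CL_FALSE, 0, sizeof (pw_t), &pw, 0, NULL, &ev) != CL_SUCCESS) return -1;

  // block until the copy is done, then release the event object
  if (clWaitForEvents (1, &ev) != CL_SUCCESS) return -1;

  clReleaseEvent (ev);

This commit (PATCH 17/37) then takes the simpler route for the hot paths and makes such writes blocking (CL_TRUE), which removes the need for the event entirely, as the backend.c hunks below show.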
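The autotuner fix mentioned above is an operator-precedence correction. In C, % binds tighter than +, so the old test (salt_iter + 1 % 125) == 0 parsed as (salt_iter + (1 % 125)) == 0, i.e. salt_iter + 1 == 0, which for a u32 salt_iter is only true on wrap-around and therefore effectively never. The patched line parenthesizes the sum before taking the remainder, so it really does catch iteration counts stored as N-1. Reduced to the relevant lines (salt_iter and MIN are taken from the autotune.c hunk below; the surrounding loop-start logic is omitted):

  // before: parses as (salt_iter + (1 % 125)) == 0, so practically never true
  // if ((hashes->st_salts_buf->salt_iter + 1 % 125) == 0) start = MIN (start, 125);

  // after: true whenever salt_iter + 1 is a multiple of 125
  if (((hashes->st_salts_buf->salt_iter + 1) % 125) == 0) start = MIN (start, 125);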
--- Python/hcshared.py | 3 +- include/ext_OpenCL.h | 2 +- include/ext_cuda.h | 27 +++- include/ext_hip.h | 15 ++ src/autotune.c | 8 +- src/backend.c | 280 ++++++++++++++++++------------------- src/ext_cuda.c | 145 ++++++++++++++++++- src/ext_hip.c | 143 ++++++++++++++++++- src/hashes.c | 14 +- src/modules/module_01300.c | 1 + src/modules/module_07100.c | 11 +- src/modules/module_08200.c | 1 + src/selftest.c | 90 ++++++------ 13 files changed, 523 insertions(+), 217 deletions(-) diff --git a/Python/hcshared.py b/Python/hcshared.py index 2fcf56c70..ad4390083 100644 --- a/Python/hcshared.py +++ b/Python/hcshared.py @@ -6,13 +6,14 @@ import sys def extract_salts(salts_buf) -> list: salts=[] - for salt_buf, salt_buf_pc, salt_len, salt_len_pc, salt_iter, salt_iter2, salt_sign, salt_repeats, orig_pos, digests_cnt, digests_done, digests_offset, scrypt_N, scrypt_r, scrypt_p in struct.iter_unpack("256s 256s I I I I 8s I I I I I I I I", salts_buf): + for salt_buf, salt_buf_pc, salt_len, salt_len_pc, salt_iter, salt_iter2, salt_dimy, salt_sign, salt_repeats, orig_pos, digests_cnt, digests_done, digests_offset, scrypt_N, scrypt_r, scrypt_p in struct.iter_unpack("256s 256s I I I I I 8s I I I I I I I I", salts_buf): salt_buf = salt_buf[0:salt_len] salt_buf_pc = salt_buf_pc[0:salt_len_pc] salts.append({ "salt_buf": salt_buf, \ "salt_buf_pc": salt_buf_pc, \ "salt_iter": salt_iter, \ "salt_iter2": salt_iter2, \ + "salt_dimy": salt_dimy, \ "salt_sign": salt_sign, \ "salt_repeats": salt_repeats, \ "orig_pos": orig_pos, \ diff --git a/include/ext_OpenCL.h b/include/ext_OpenCL.h index d49f82c79..7044ac667 100644 --- a/include/ext_OpenCL.h +++ b/include/ext_OpenCL.h @@ -51,7 +51,7 @@ typedef cl_int (CL_API_CALL *OCL_CLENQUEUEFILLBUFFER) (cl_comman typedef cl_int (CL_API_CALL *OCL_CLENQUEUECOPYBUFFER) (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *); typedef void * (CL_API_CALL *OCL_CLENQUEUEMAPBUFFER) (cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *); typedef cl_int (CL_API_CALL *OCL_CLENQUEUENDRANGEKERNEL) (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); -typedef cl_int (CL_API_CALL *OCL_CLENQUEUEREADBUFFER) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int (CL_API_CALL *OCL_CLENQUEUEREADBUFFER) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); typedef cl_int (CL_API_CALL *OCL_CLENQUEUEUNMAPMEMOBJECT) (cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); typedef cl_int (CL_API_CALL *OCL_CLENQUEUEWRITEBUFFER) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); typedef cl_int (CL_API_CALL *OCL_CLFINISH) (cl_command_queue); diff --git a/include/ext_cuda.h b/include/ext_cuda.h index 44ae01cd6..a80af5367 100644 --- a/include/ext_cuda.h +++ b/include/ext_cuda.h @@ -1154,14 +1154,19 @@ typedef CUresult (CUDA_API_CALL *CUDA_CUINIT) (unsigned int); typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL) (CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC) (CUdeviceptr *, size_t); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST) (void **, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD) 
(CUdeviceptr, CUdeviceptr, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH) (void *, CUdeviceptr, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD) (CUdeviceptr, const void *, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32) (CUdeviceptr, unsigned int, size_t); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8) (CUdeviceptr, unsigned char, size_t); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTODASYNC) (CUdeviceptr, CUdeviceptr, size_t, CUstream); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOHASYNC) (void *, CUdeviceptr, size_t, CUstream); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTODASYNC) (CUdeviceptr, const void *, size_t, CUstream); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32ASYNC) (CUdeviceptr, unsigned int, size_t, CUstream); +typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8ASYNC) (CUdeviceptr, unsigned char, size_t, CUstream); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE) (CUdeviceptr); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST) (void *); typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO) (size_t *, size_t *); -typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32ASYNC) (CUdeviceptr, unsigned int, size_t, CUstream); -typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8ASYNC) (CUdeviceptr, unsigned char, size_t, CUstream); typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETFUNCTION) (CUfunction *, CUmodule, const char *); typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETGLOBAL) (CUdeviceptr *, size_t *, CUmodule, const char *); typedef CUresult (CUDA_API_CALL *CUDA_CUMODULELOAD) (CUmodule *, const char *); @@ -1217,14 +1222,19 @@ typedef struct hc_cuda_lib CUDA_CULAUNCHKERNEL cuLaunchKernel; CUDA_CUMEMALLOC cuMemAlloc; CUDA_CUMEMALLOCHOST cuMemAllocHost; + CUDA_CUMEMCPYDTOD cuMemcpyDtoD; + CUDA_CUMEMCPYDTOH cuMemcpyDtoH; + CUDA_CUMEMCPYHTOD cuMemcpyHtoD; + CUDA_CUMEMSETD32 cuMemsetD32; + CUDA_CUMEMSETD8 cuMemsetD8; CUDA_CUMEMCPYDTODASYNC cuMemcpyDtoDAsync; CUDA_CUMEMCPYDTOHASYNC cuMemcpyDtoHAsync; CUDA_CUMEMCPYHTODASYNC cuMemcpyHtoDAsync; + CUDA_CUMEMSETD32ASYNC cuMemsetD32Async; + CUDA_CUMEMSETD8ASYNC cuMemsetD8Async; CUDA_CUMEMFREE cuMemFree; CUDA_CUMEMFREEHOST cuMemFreeHost; CUDA_CUMEMGETINFO cuMemGetInfo; - CUDA_CUMEMSETD32ASYNC cuMemsetD32Async; - CUDA_CUMEMSETD8ASYNC cuMemsetD8Async; CUDA_CUMODULEGETFUNCTION cuModuleGetFunction; CUDA_CUMODULEGETGLOBAL cuModuleGetGlobal; CUDA_CUMODULELOAD cuModuleLoad; @@ -1272,13 +1282,18 @@ int hc_cuFuncSetAttribute (void *hashcat_ctx, CUfunction hfunc, CUfunction_ int hc_cuInit (void *hashcat_ctx, unsigned int Flags); int hc_cuLaunchKernel (void *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); int hc_cuMemAlloc (void *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize); +int hc_cuMemcpyDtoD (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +int hc_cuMemcpyDtoH (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); +int hc_cuMemcpyHtoD (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); +int hc_cuMemsetD32 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N); +int hc_cuMemsetD8 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N); int hc_cuMemcpyDtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); int 
hc_cuMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); int hc_cuMemcpyHtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); -int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr); -int hc_cuMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total); int hc_cuMemsetD32Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); int hc_cuMemsetD8Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); +int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr); +int hc_cuMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total); int hc_cuModuleGetFunction (void *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name); int hc_cuModuleGetGlobal (void *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); int hc_cuModuleLoadDataEx (void *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); diff --git a/include/ext_hip.h b/include/ext_hip.h index 4cc6fb797..d0f53d173 100644 --- a/include/ext_hip.h +++ b/include/ext_hip.h @@ -588,6 +588,11 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPLAUNCHKERNEL) (hipFunction_t, typedef hipError_t (HIP_API_CALL *HIP_HIPMEMALLOC) (hipDeviceptr_t *, size_t); typedef hipError_t (HIP_API_CALL *HIP_HIPMEMFREE) (hipDeviceptr_t); typedef hipError_t (HIP_API_CALL *HIP_HIPMEMGETINFO) (size_t *, size_t *); +typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTOD) (hipDeviceptr_t, hipDeviceptr_t, size_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTOH) (void *, hipDeviceptr_t, size_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYHTOD) (hipDeviceptr_t, const void *, size_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPMEMSETD32) (hipDeviceptr_t, unsigned int, size_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPMEMSETD8) (hipDeviceptr_t, unsigned char, size_t); typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTODASYNC) (hipDeviceptr_t, hipDeviceptr_t, size_t, hipStream_t); typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTOHASYNC) (void *, hipDeviceptr_t, size_t, hipStream_t); typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYHTODASYNC) (hipDeviceptr_t, const void *, size_t, hipStream_t); @@ -633,6 +638,11 @@ typedef struct hc_hip_lib HIP_HIPMEMALLOC hipMemAlloc; HIP_HIPMEMFREE hipMemFree; HIP_HIPMEMGETINFO hipMemGetInfo; + HIP_HIPMEMCPYDTOD hipMemcpyDtoD; + HIP_HIPMEMCPYDTOH hipMemcpyDtoH; + HIP_HIPMEMCPYHTOD hipMemcpyHtoD; + HIP_HIPMEMSETD32 hipMemsetD32; + HIP_HIPMEMSETD8 hipMemsetD8; HIP_HIPMEMCPYDTODASYNC hipMemcpyDtoDAsync; HIP_HIPMEMCPYDTOHASYNC hipMemcpyDtoHAsync; HIP_HIPMEMCPYHTODASYNC hipMemcpyHtoDAsync; @@ -680,6 +690,11 @@ int hc_hipLaunchKernel (void *hashcat_ctx, hipFunction_t f, unsigned int int hc_hipMemAlloc (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize); int hc_hipMemFree (void *hashcat_ctx, hipDeviceptr_t dptr); int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total); +int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount); +int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount); +int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount); +int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N); +int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, 
unsigned char uc, size_t N); int hc_hipMemcpyDtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); int hc_hipMemcpyHtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream); diff --git a/src/autotune.c b/src/autotune.c index 065c0a217..a599e65be 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -268,12 +268,12 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t)) == -1) return -1; } #if defined (__APPLE__) @@ -344,8 +344,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter)); start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter + 1)); - if ((hashes->st_salts_buf->salt_iter % 125) == 0) start = MIN (start, 125); - if ((hashes->st_salts_buf->salt_iter + 1 % 125) == 0) start = MIN (start, 125); + if (((hashes->st_salts_buf->salt_iter + 0) % 125) == 0) start = MIN (start, 125); + if (((hashes->st_salts_buf->salt_iter + 1) % 125) == 0) start = MIN (start, 125); if ((start >= kernel_loops_min) && (start <= kernel_loops_max)) { diff --git a/src/backend.c b/src/backend.c index 00cac2245..2d34caee1 100644 --- a/src/backend.c +++ b/src/backend.c @@ -985,7 +985,7 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c { if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1; - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &pw_idx, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } @@ -994,7 +994,7 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c { if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &pw_idx, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -1020,14 +1020,14 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t 
*device_param, c { if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, pw->i, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, pw->i, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, pw->i, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, pw->i, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -1072,7 +1072,7 @@ int copy_pws_idx (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u { if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1; - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, dest, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t)), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, dest, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t))) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; @@ -1083,7 +1083,7 @@ int copy_pws_idx (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u { if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, dest, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t)), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t))) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; @@ -1112,7 +1112,7 @@ int copy_pws_comp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, { if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1; - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, dest, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, dest, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; @@ -1123,7 +1123,7 @@ int copy_pws_comp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, { if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, dest, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; @@ -1199,12 +1199,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_tm_c, size_tm, device_param->cuda_stream) == -1) return -1; + if 
(hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_tm_c, size_tm) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_tm_c, size_tm, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_tm_c, size_tm) == -1) return -1; } #if defined (__APPLE__) @@ -1309,12 +1309,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, { if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, device_param->cuda_d_pws_amp_buf, pws_cnt * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_pws_buf, device_param->cuda_d_pws_amp_buf, pws_cnt * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, device_param->hip_d_pws_amp_buf, pws_cnt * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_pws_buf, device_param->hip_d_pws_amp_buf, pws_cnt * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -1373,14 +1373,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -1432,12 +1432,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; } #if defined (__APPLE__) @@ -1534,14 +1534,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, { if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if 
(hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -1563,14 +1563,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -1625,14 +1625,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -1684,12 +1684,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1; } #if defined (__APPLE__) @@ -1774,14 +1774,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, { if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size, 
device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -1803,14 +1803,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -2059,12 +2059,12 @@ int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t return 0; } -int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u8 value, const u64 size) +int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u8 value, const u64 size) { - return hc_cuMemsetD8Async (hashcat_ctx, buf + offset, value, size, device_param->cuda_stream); + return hc_cuMemsetD8 (hashcat_ctx, buf + offset, value, size); } -int run_cuda_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u32 value, const u64 size) +int run_cuda_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u32 value, const u64 size) { /* check that the size is multiple of element size */ if (size % 4 != 0) @@ -2072,7 +2072,7 @@ int run_cuda_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *dev return CUDA_ERROR_INVALID_VALUE; } - return hc_cuMemsetD32Async (hashcat_ctx, buf + offset, value, size / 4, device_param->cuda_stream); + return hc_cuMemsetD32 (hashcat_ctx, buf + offset, value, size / 4); } int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size) @@ -2096,7 +2096,7 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device if (num16m) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), bzeros, num16m, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), bzeros, num16m) == -1) return -1; } return 0; @@ -2138,12 +2138,12 @@ int run_hip_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t return 
0; } -int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u8 value, const u64 size) +int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u8 value, const u64 size) { - return hc_hipMemsetD8Async (hashcat_ctx, buf + offset, value, size, device_param->hip_stream); + return hc_hipMemsetD8 (hashcat_ctx, buf + offset, value, size); } -int run_hip_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u32 value, const u64 size) +int run_hip_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u32 value, const u64 size) { /* check that the size is multiple of element size */ if (size % 4 != 0) @@ -2151,7 +2151,7 @@ int run_hip_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi return hipErrorInvalidValue; } - return hc_hipMemsetD32Async (hashcat_ctx, buf + offset, value, size / 4, device_param->hip_stream); + return hc_hipMemsetD32 (hashcat_ctx, buf + offset, value, size / 4); } int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 size) @@ -2175,7 +2175,7 @@ int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (num16m) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), bzeros, num16m, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), bzeros, num16m) == -1) return -1; } return 0; @@ -2466,7 +2466,7 @@ int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi } else { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_FALSE, num16d * 16, num16m, bzeros, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_TRUE, num16d * 16, num16m, bzeros, 0, NULL, NULL) == -1) return -1; } } @@ -2592,7 +2592,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con case KERN_RUN_AUX4: cuda_function = device_param->cuda_function_aux4; break; } - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params) == -1) return -1; if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1; @@ -2711,7 +2711,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con case KERN_RUN_AUX4: hip_function = device_param->hip_function_aux4; break; } - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params) == -1) return -1; //if (hc_hipFuncSetAttribute (hashcat_ctx, hip_function, HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1; @@ -2982,7 +2982,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if 
(hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]) == -1) return -1; } - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_kernel_param, CL_FALSE, 0, device_param->size_kernel_params, &device_param->kernel_param, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_kernel_param, CL_TRUE, 0, device_param->size_kernel_params, &device_param->kernel_param, 0, NULL, NULL) == -1) return -1; /* for (u32 i = 24; i <= 34; i++) @@ -3616,7 +3616,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3624,13 +3624,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3638,7 +3638,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } @@ -3660,7 +3660,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3668,7 +3668,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; } } @@ -3680,7 +3680,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, 
hc_device_param_t *device_param, const { if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3688,13 +3688,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3702,7 +3702,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } @@ -3724,7 +3724,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3732,7 +3732,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; } } @@ -3778,7 +3778,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3786,13 +3786,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof 
(u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3800,7 +3800,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } @@ -3822,7 +3822,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3830,7 +3830,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; } } @@ -3842,7 +3842,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3850,13 +3850,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3864,7 +3864,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t 
*device_param, const if (off) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } @@ -3886,7 +3886,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3894,7 +3894,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; } } @@ -3904,7 +3904,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const { if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3912,13 +3912,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3926,7 +3926,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1; } } @@ -3948,7 +3948,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == 
-1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1; const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt; @@ -3956,7 +3956,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const if (off) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1; } } @@ -4190,12 +4190,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co { if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t)) == -1) return -1; } #if defined (__APPLE__) @@ -4322,12 +4322,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -4339,7 +4339,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1; } } else if (user_options->attack_mode == ATTACK_MODE_HYBRID1) @@ -4352,12 +4352,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, 
device_param->cuda_d_combs, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -4382,12 +4382,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -4515,12 +4515,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -4532,7 +4532,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1; } } else if (user_options->attack_mode == ATTACK_MODE_HYBRID1) @@ -4545,12 +4545,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * 
sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -4577,12 +4577,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_bfs, innerloop_left * sizeof (bf_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_bfs, innerloop_left * sizeof (bf_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_bfs, innerloop_left * sizeof (bf_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_bfs, innerloop_left * sizeof (bf_t)) == -1) return -1; } #if defined (__APPLE__) @@ -11863,16 +11863,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf, size_st_salts) == -1) return -1; if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_kernel_param, size_kernel_params) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_digests_buf, hashes->digests_buf, size_digests, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_salt_bufs, hashes->salts_buf, size_salts, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return 
-1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_digests_buf, hashes->digests_buf, size_digests) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_salt_bufs, hashes->salts_buf, size_salts) == -1) return -1; /** * special buffers @@ -11899,7 +11899,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c) == -1) return -1; } - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules_src, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules_src) == -1) return -1; } else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) { @@ -11934,19 +11934,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_esalt_bufs, size_esalts) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_esalt_bufs, hashes->esalts_buf, size_esalts, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_esalt_bufs, hashes->esalts_buf, size_esalts) == -1) return -1; } if (hashconfig->st_hash != NULL) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_st_digests_buf, hashes->st_digests_buf, size_st_digests, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_st_salts_buf, hashes->st_salts_buf, size_st_salts, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_digests_buf, hashes->st_digests_buf, size_st_digests) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_salts_buf, hashes->st_salts_buf, size_st_salts) == -1) return -1; if (size_esalts) { if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_esalts_buf, size_st_esalts) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts) == -1) return -1; } } } @@ -11974,16 +11974,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_salts_buf, size_st_salts) == -1) return -1; if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_kernel_param, size_kernel_params) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_c, 
bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_digests_buf, hashes->digests_buf, size_digests, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_salt_bufs, hashes->salts_buf, size_salts, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_digests_buf, hashes->digests_buf, size_digests) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_salt_bufs, hashes->salts_buf, size_salts) == -1) return -1; /** * special buffers @@ -12010,7 +12010,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_rules_c, size_rules_c) == -1) return -1; } - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_rules, straight_ctx->kernel_rules_buf, size_rules_src, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_rules, straight_ctx->kernel_rules_buf, size_rules_src) == -1) return -1; } else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) { @@ -12045,19 +12045,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_esalt_bufs, size_esalts) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_esalt_bufs, hashes->esalts_buf, size_esalts, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_esalt_bufs, 
hashes->esalts_buf, size_esalts) == -1) return -1; } if (hashconfig->st_hash != NULL) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_st_digests_buf, hashes->st_digests_buf, size_st_digests, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_st_salts_buf, hashes->st_salts_buf, size_st_salts, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_digests_buf, hashes->st_digests_buf, size_st_digests) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_salts_buf, hashes->st_salts_buf, size_st_salts) == -1) return -1; if (size_esalts) { if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_esalts_buf, size_st_esalts) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts) == -1) return -1; } } } @@ -12195,15 +12195,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_salts, NULL, &device_param->opencl_d_st_salts_buf) == -1) return -1; if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_kernel_params, NULL, &device_param->opencl_d_kernel_param) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_a, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_a, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_b, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_c, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_d, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_a, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_b, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_c, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_d, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_buf, CL_FALSE, 0, size_digests, hashes->digests_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, 
bitmap_ctx->bitmap_s1_a, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_buf, CL_TRUE, 0, size_digests, hashes->digests_buf, 0, NULL, NULL) == -1) return -1; if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_salt_bufs, CL_FALSE, 0, size_salts, hashes->salts_buf, 0, NULL, NULL) == -1) return -1; /** @@ -12221,7 +12221,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules, NULL, &device_param->opencl_d_rules) == -1) return -1; if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, CL_FALSE, 0, size_rules_src, straight_ctx->kernel_rules_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, CL_TRUE, 0, size_rules_src, straight_ctx->kernel_rules_buf, 0, NULL, NULL) == -1) return -1; } else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) { @@ -12244,7 +12244,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_esalts, NULL, &device_param->opencl_d_esalt_bufs) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_esalt_bufs, CL_FALSE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_esalt_bufs, CL_TRUE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL) == -1) return -1; } if (hashconfig->st_hash != NULL) @@ -12256,7 +12256,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { if (hc_clCreateBuffer 
(hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_esalts, NULL, &device_param->opencl_d_st_esalts_buf) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_esalts_buf, CL_FALSE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_esalts_buf, CL_TRUE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL) == -1) return -1; } } @@ -17537,14 +17537,14 @@ int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; } #if defined (__APPLE__) @@ -17558,7 +17558,7 @@ int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx) if (device_param->is_opencl == true) { if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_root_css_buf, CL_FALSE, 0, device_param->size_root_css, mask_ctx->root_css_buf, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_FALSE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_TRUE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1; if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; } @@ -17593,14 +17593,14 @@ int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css, device_param->cuda_stream) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css) == -1) return -1; + if (hc_cuMemcpyHtoD 
(hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css, device_param->hip_stream) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_root_css_buf, mask_ctx->root_css_buf, device_param->size_root_css) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1; } #if defined (__APPLE__) @@ -17614,7 +17614,7 @@ int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_ if (device_param->is_opencl == true) { if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_root_css_buf, CL_FALSE, 0, device_param->size_root_css, mask_ctx->root_css_buf, 0, NULL, NULL) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_FALSE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_TRUE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1; if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; } diff --git a/src/ext_cuda.c b/src/ext_cuda.c index 4c0db9ffb..878977ade 100644 --- a/src/ext_cuda.c +++ b/src/ext_cuda.c @@ -87,14 +87,19 @@ int cuda_init (void *hashcat_ctx) HC_LOAD_FUNC_CUDA (cuda, cuLaunchKernel, cuLaunchKernel, CUDA_CULAUNCHKERNEL, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc, cuMemAlloc_v2, CUDA_CUMEMALLOC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost, cuMemAllocHost_v2, CUDA_CUMEMALLOCHOST, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoD, cuMemcpyDtoD_v2, CUDA_CUMEMCPYDTOD, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoH, cuMemcpyDtoH_v2, CUDA_CUMEMCPYDTOH, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoD, cuMemcpyHtoD_v2, CUDA_CUMEMCPYHTOD, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32, cuMemsetD32, CUDA_CUMEMSETD32, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8, cuMemsetD8, CUDA_CUMEMSETD8, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync, cuMemcpyDtoDAsync_v2, CUDA_CUMEMCPYDTODASYNC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync, cuMemcpyDtoHAsync_v2, CUDA_CUMEMCPYDTOHASYNC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync, cuMemcpyHtoDAsync_v2, CUDA_CUMEMCPYHTODASYNC, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async, cuMemsetD32Async, CUDA_CUMEMSETD32ASYNC, CUDA, 1); + HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async, cuMemsetD8Async, CUDA_CUMEMSETD8ASYNC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemFree, cuMemFree_v2, CUDA_CUMEMFREE, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost, cuMemFreeHost, CUDA_CUMEMFREEHOST, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo, cuMemGetInfo_v2, CUDA_CUMEMGETINFO, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async, cuMemsetD32Async, CUDA_CUMEMSETD32ASYNC, CUDA, 1); - HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async, cuMemsetD8Async, CUDA_CUMEMSETD8ASYNC, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuModuleGetFunction, 
cuModuleGetFunction, CUDA_CUMODULEGETFUNCTION, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuModuleGetGlobal, cuModuleGetGlobal_v2, CUDA_CUMODULEGETGLOBAL, CUDA, 1); HC_LOAD_FUNC_CUDA (cuda, cuModuleLoad, cuModuleLoad, CUDA_CUMODULELOAD, CUDA, 1); @@ -517,6 +522,142 @@ int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr) return 0; } + +int hc_cuMemcpyDtoH (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyDtoH (dstHost, srcDevice, ByteCount); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemcpyDtoD (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyDtoD (dstDevice, srcDevice, ByteCount); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemcpyHtoD (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemcpyHtoD (dstDevice, srcHost, ByteCount); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemsetD32 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemsetD32 (dstDevice, ui, N); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemsetD32(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemsetD32(): %d", CU_err); + } + + return -1; + } + + return 0; +} + +int hc_cuMemsetD8 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda; + + const CUresult CU_err = cuda->cuMemsetD8 (dstDevice, uc, N); + + if (CU_err != CUDA_SUCCESS) + { + const char *pStr = NULL; + + if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS) + { + event_log_error (hashcat_ctx, "cuMemsetD8(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "cuMemsetD8(): %d", CU_err); + } + + return -1; + } + + return 0; +} + int hc_cuMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) { backend_ctx_t *backend_ctx 
= ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; diff --git a/src/ext_hip.c b/src/ext_hip.c index decfa987c..4a26a7e9f 100644 --- a/src/ext_hip.c +++ b/src/ext_hip.c @@ -140,12 +140,16 @@ int hip_init (void *hashcat_ctx) HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1); + HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync, hipMemcpyDtoDAsync, HIP_HIPMEMCPYDTODASYNC, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync, hipMemcpyDtoHAsync, HIP_HIPMEMCPYDTOHASYNC, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async, hipMemsetD32Async, HIP_HIPMEMSETD32ASYNC, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async, hipMemsetD8Async, HIP_HIPMEMSETD8ASYNC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1); HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1); @@ -800,6 +804,143 @@ int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total) return 0; } + + +int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipMemcpyDtoH (dstHost, srcDevice, ByteCount); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + +int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipMemcpyDtoD (dstDevice, srcDevice, ByteCount); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + +int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipMemcpyHtoD (dstDevice, srcHost, ByteCount); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error 
(hashcat_ctx, "hipMemcpyHtoD(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipMemcpyHtoD(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + +int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipMemsetD32 (dstDevice, ui, N); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipMemsetD32(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipMemsetD32(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + +int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N) +{ + backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; + + HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip; + + const hipError_t HIP_err = hip->hipMemsetD8 (dstDevice, uc, N); + + if (HIP_err != hipSuccess) + { + const char *pStr = NULL; + + if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess) + { + event_log_error (hashcat_ctx, "hipMemsetD8(): %s", pStr); + } + else + { + event_log_error (hashcat_ctx, "hipMemsetD8(): %d", HIP_err); + } + + return -1; + } + + return 0; +} + int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream) { backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx; diff --git a/src/hashes.c b/src/hashes.c index 78a6d5b90..72ab14433 100644 --- a/src/hashes.c +++ b/src/hashes.c @@ -334,7 +334,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla if (device_param->is_cuda == true) { - rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->cuda_stream); + rc = hc_cuMemcpyDtoH (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size); if (rc == 0) { @@ -351,7 +351,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla if (device_param->is_hip == true) { - rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->hip_stream); + rc = hc_hipMemcpyDtoH (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size); if (rc == 0) { @@ -382,7 +382,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla if (device_param->is_opencl == true) { - rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_FALSE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, &opencl_event); + rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_TRUE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, &opencl_event); if (rc == 0) { @@ -587,14 +587,14 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32)) == -1) return 
-1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32)) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -624,7 +624,7 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) if (device_param->is_cuda == true) { - rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->cuda_stream); + rc = hc_cuMemcpyDtoH (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t)); if (rc == 0) { @@ -641,7 +641,7 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) if (device_param->is_hip == true) { - rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->hip_stream); + rc = hc_hipMemcpyDtoH (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t)); if (rc == 0) { diff --git a/src/modules/module_01300.c b/src/modules/module_01300.c index b50a0e99c..c79a3493c 100644 --- a/src/modules/module_01300.c +++ b/src/modules/module_01300.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH; static const char *HASH_NAME = "SHA2-224"; static const u64 KERN_TYPE = 1300; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_07100.c b/src/modules/module_07100.c index ec3924cec..78c8f8081 100644 --- a/src/modules/module_07100.c +++ b/src/modules/module_07100.c @@ -85,15 +85,6 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con return pw_max; } -char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) -{ - char *jit_build_options = NULL; - - hc_asprintf (&jit_build_options, "-D NO_UNROLL"); - - return jit_build_options; -} - int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) { u64 *digest = (u64 *) digest_buf; @@ -398,7 +389,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_hook23 = MODULE_DEFAULT; module_ctx->module_hook_salt_size = MODULE_DEFAULT; module_ctx->module_hook_size = MODULE_DEFAULT; - module_ctx->module_jit_build_options = module_jit_build_options; + module_ctx->module_jit_build_options = MODULE_DEFAULT; module_ctx->module_jit_cache_disable = MODULE_DEFAULT; module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; diff --git a/src/modules/module_08200.c b/src/modules/module_08200.c index 54d5c62b5..90049b1ed 100644 --- a/src/modules/module_08200.c +++ b/src/modules/module_08200.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER; static const char *HASH_NAME = "1Password, 
cloudkeychain"; static const u64 KERN_TYPE = 8200; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE diff --git a/src/selftest.c b/src/selftest.c index d8b105d2e..5e744f88b 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -103,12 +103,12 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -120,7 +120,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else @@ -148,12 +148,12 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -165,7 +165,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI) @@ -218,16 +218,16 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, &comb, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, &comb, 1 * sizeof (pw_t)) == -1) return -1; - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { 
- if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, &comb, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, &comb, 1 * sizeof (pw_t)) == -1) return -1; - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -241,9 +241,9 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, &opencl_event) == -1) return -1; - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else if (user_options_extra->attack_kern == ATTACK_KERN_BF) @@ -269,12 +269,12 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -286,7 +286,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } else @@ -330,12 +330,12 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bfs_c, &bf, 1 * sizeof (bf_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bfs_c, &bf, 1 * sizeof (bf_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bfs_c, &bf, 1 * sizeof (bf_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bfs_c, &bf, 1 * sizeof (bf_t)) == -1) return -1; } #if defined (__APPLE__) @@ -347,7 +347,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { 
- if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs_c, CL_FALSE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs_c, CL_TRUE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, &opencl_event) == -1) return -1; } memset (&pw, 0, sizeof (pw)); @@ -436,12 +436,12 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -453,7 +453,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } *highest_pw_len = pw.pw_len; @@ -474,12 +474,12 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1; } #if defined (__APPLE__) @@ -491,7 +491,7 @@ static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_ if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1; } } } @@ -587,14 +587,14 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, 
device_param->size_hooks, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, device_param->size_hooks) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -616,12 +616,12 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1; } #if defined (__APPLE__) @@ -633,7 +633,7 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_FALSE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; } } @@ -683,14 +683,14 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d { if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -724,14 +724,14 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d { if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, hashconfig->tmp_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -758,14 +758,14 @@ static int selftest_run_kernel (hashcat_ctx_t 
*hashcat_ctx, hc_device_param_t *d if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, device_param->size_hooks, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, device_param->size_hooks) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -787,12 +787,12 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1; } #if defined (__APPLE__) @@ -804,7 +804,7 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d if (device_param->is_opencl == true) { - if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_FALSE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; + if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1; } } } @@ -846,14 +846,14 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d { if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size) == -1) return -1; if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; } @@ -887,12 +887,12 @@ static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *d { if (device_param->is_cuda == true) { - if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, 
hashconfig->tmp_size) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size) == -1) return -1; } #if defined (__APPLE__) @@ -962,14 +962,14 @@ static int selftest_cleanup (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi if (device_param->is_cuda == true) { - if (hc_cuMemcpyDtoHAsync (hashcat_ctx, num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1; + if (hc_cuMemcpyDtoH (hashcat_ctx, num_cracked, device_param->cuda_d_result, sizeof (u32)) == -1) return -1; if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event3, device_param->cuda_stream) == -1) return -1; } if (device_param->is_hip == true) { - if (hc_hipMemcpyDtoHAsync (hashcat_ctx, num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1; + if (hc_hipMemcpyDtoH (hashcat_ctx, num_cracked, device_param->hip_d_result, sizeof (u32)) == -1) return -1; if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event3, device_param->hip_stream) == -1) return -1; } @@ -983,7 +983,7 @@ static int selftest_cleanup (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi if (device_param->is_opencl == true) { - if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_FALSE, 0, sizeof (u32), num_cracked, 0, NULL, &opencl_event) == -1) return -1; + if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), num_cracked, 0, NULL, &opencl_event) == -1) return -1; if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1; } From 696fa3b2ad741f60e71589954cce9091ae6c1736 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 30 Jun 2025 19:38:54 +0200 Subject: [PATCH 18/37] Modified the automatic kernel-accel count reduction routine to also reduce kernel-thread count if insufficient device or host memory is available. Reduced the fixed memory reservation size from 1GiB to 64MiB as a result. Added a warning when the user sets a thread count on the command line higher than recommended by the runtime (based on available registers and shared memory). Added host-side logic to detect true funnel shift support and disable kernels using it if not supported on the device. Updated more plugins to limit register count to 128 on NVIDIA GPUs. 
--- OpenCL/inc_vendor.h | 13 +++- include/types.h | 1 + src/backend.c | 119 +++++++++++++++++++++++-------------- src/modules/module_00020.c | 1 + src/modules/module_00021.c | 1 + src/modules/module_00022.c | 1 + src/modules/module_00023.c | 1 + src/modules/module_00024.c | 1 + src/modules/module_01500.c | 3 +- 9 files changed, 95 insertions(+), 46 deletions(-) diff --git a/OpenCL/inc_vendor.h b/OpenCL/inc_vendor.h index a52b4c899..a0df70d23 100644 --- a/OpenCL/inc_vendor.h +++ b/OpenCL/inc_vendor.h @@ -183,13 +183,11 @@ using namespace metal; #ifdef IS_CUDA #define USE_BITSELECT #define USE_ROTATE -#define USE_FUNNELSHIFT #endif #ifdef IS_HIP #define USE_BITSELECT #define USE_ROTATE -#define USE_FUNNELSHIFT #endif #ifdef IS_ROCM @@ -220,7 +218,18 @@ using namespace metal; #define s3 w #endif +// funnelshift is nv's bitalign starting from sm 70 + +#ifdef IS_NV +#if HAS_SHFW == 1 +#define USE_FUNNELSHIFT +#endif +#elif IS_HIP +#define USE_FUNNELSHIFT +#endif + // some algorithms do not like this, eg 150, 1100, ... + #ifdef NO_FUNNELSHIFT #undef USE_FUNNELSHIFT #endif diff --git a/include/types.h b/include/types.h index 600e7bd93..896ac9c42 100644 --- a/include/types.h +++ b/include/types.h @@ -1516,6 +1516,7 @@ typedef struct hc_device_param bool has_lop3; bool has_mov64; bool has_prmt; + bool has_shfw; double spin_damp; diff --git a/src/backend.c b/src/backend.c index 2d34caee1..a9318be79 100644 --- a/src/backend.c +++ b/src/backend.c @@ -402,7 +402,8 @@ static bool is_same_device_type (const hc_device_param_t *src, const hc_device_p } if (src->device_processors != dst->device_processors) return false; - if (src->device_maxclock_frequency != dst->device_maxclock_frequency) return false; + // clocks can be different, but clocks should have no impact on workload tuning + // if (src->device_maxclock_frequency != dst->device_maxclock_frequency) return false; if (src->device_maxworkgroup_size != dst->device_maxworkgroup_size) return false; // memory size can be different, depending on which gpu has a monitor connected @@ -5920,6 +5921,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = (sm >= 50) ? true : false; device_param->has_mov64 = (sm >= 10) ? true : false; device_param->has_prmt = (sm >= 20) ? true : false; + device_param->has_shfw = (sm >= 70) ? true : false; // device_available_mem @@ -6397,6 +6399,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = false; device_param->has_mov64 = false; device_param->has_prmt = false; + device_param->has_shfw = false; // device_available_mem @@ -6885,6 +6888,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = false; device_param->has_mov64 = false; device_param->has_prmt = false; + device_param->has_shfw = false; // check if we need skip device @@ -8033,6 +8037,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = (sm >= 50) ? true : false; device_param->has_mov64 = (sm >= 10) ? true : false; device_param->has_prmt = (sm >= 20) ? true : false; + device_param->has_shfw = (sm >= 70) ? 
true : false; } // common driver check @@ -8428,6 +8433,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ device_param->has_prmt = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_shfw = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"shf.l.wrap.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ if (backend_devices_idx > 0) { @@ -8443,6 +8449,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = device_param_prev->has_lop3; device_param->has_mov64 = device_param_prev->has_mov64; device_param->has_prmt = device_param_prev->has_prmt; + device_param->has_shfw = device_param_prev->has_shfw; } else { @@ -8717,6 +8724,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }"); \ device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \ device_param->has_prmt = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ + device_param->has_shfw = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"shf.l.wrap.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }"); \ if (backend_devices_idx > 0) { @@ -8732,6 +8740,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = device_param_prev->has_lop3; device_param->has_mov64 = device_param_prev->has_mov64; device_param->has_prmt = device_param_prev->has_prmt; + device_param->has_shfw = device_param_prev->has_shfw; } else { @@ -11000,9 +11009,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // we don't have sm_* on vendors not NV but it doesn't matter #if defined (DEBUG) - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), 
device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode); + build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D HAS_SHFW=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->has_shfw, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode); #else - build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, 
device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode); + build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D HAS_SHFW=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->has_shfw, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode); #endif build_options_buf[build_options_len] = 0; @@ -16071,6 +16080,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) threads_per_block = device_param->kernel_preferred_wgs_multiple; } + if (user_options->kernel_threads_chgd == true) + { + if (threads_per_block < user_options->kernel_threads) + { + event_log_warning (hashcat_ctx, "* Device #%u: The requested thread size '%d' exceeds the recommended limit of the backend runtime '%d'.", device_id + 1, user_options->kernel_threads, threads_per_block); + } + } + if ((threads_per_block >= device_param->kernel_threads_min) && (threads_per_block <= device_param->kernel_threads_max)) { //printf ("auto thread max: %d\n", threads_per_block); @@ -16126,11 +16143,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->overtune_unfriendly = true; } - // device_param->kernel_threads = kernel_threads; - device_param->kernel_threads = 0; - const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) - * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 
1 : device_param->kernel_threads_max); + device_param->kernel_threads = 0; + device_param->kernel_accel = 0; + + u32 kernel_threads_min = device_param->kernel_threads_min; + u32 kernel_threads_max = device_param->kernel_threads_max; u32 kernel_accel_min = device_param->kernel_accel_min; u32 kernel_accel_max = device_param->kernel_accel_max; @@ -16179,27 +16197,29 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_hip == true) local_size_bytes = hip_query_max_local_size_bytes (hashcat_ctx, device_param); if (device_param->is_opencl == true) local_size_bytes = opencl_query_max_local_size_bytes (hashcat_ctx, device_param); // metal todo - - // use this parameter to tune down kernel_accel_max, because it has such a huge impact on memory requirement - // let's target a maximum use of memory of 8GiB so that there's some room left for other stuff - - if (local_size_bytes) - { - const u64 SIZE_8GiB = 8ULL * 1024 * 1024 * 1024; - - const u64 max_accel = SIZE_8GiB / (hardware_power_max * local_size_bytes); - - kernel_accel_max = MIN (kernel_accel_max, max_accel); - } } const u64 size_device_extra1234 = size_extra_buffer1 + size_extra_buffer2 + size_extra_buffer3 + size_extra_buffer4; - const u64 size_device_extra = MAX ((1024 * 1024 * 1024), size_device_extra1234); + // Still not 100% sure about the 64MiB here - while (kernel_accel_max >= kernel_accel_min) + const u64 size_device_extra = MAX ((64ULL * 1024 * 1024), size_device_extra1234); + + // we will first decrease accel and when reached that limit, we will decrease threads + // when we decrease limit this will restore accel_max + + int memory_limit_hit = 0; + + const u32 kernel_accel_max_sav = kernel_accel_max; + + while ((kernel_accel_max >= kernel_accel_min) || (kernel_threads_max >= kernel_threads_min)) { - const u64 kernel_power_max = hardware_power_max * kernel_accel_max; + const u64 device_processors = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors); + const u64 kernel_threads = ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads_max); + + const u64 kernel_power_max = device_processors * kernel_threads * kernel_accel_max; + + // size_spilling const u64 size_spilling = kernel_power_max * local_size_bytes; @@ -16246,21 +16266,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // now check if all device-memory sizes which depend on the kernel_accel_max amplifier are within its boundaries // if not, decrease amplifier and try again - int memory_limit_hit = 0; + memory_limit_hit = 0; // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate. // let's add some extra space just to be sure. 
// now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit // let's see if we still need this now that we have low-level API to report free memory - u64 EXTRA_SPACE = 4096; //(1024ULL * 1024ULL) * kernel_accel_max; - - //EXTRA_SPACE = MAX (EXTRA_SPACE, ( 256ULL * 1024ULL * 1024ULL)); - //EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL)); - - if ((size_pws + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; - if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; - if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; + if (size_pws > device_param->device_maxmem_alloc) memory_limit_hit = 1; + if (size_tmps > device_param->device_maxmem_alloc) memory_limit_hit = 1; + if (size_hooks > device_param->device_maxmem_alloc) memory_limit_hit = 1; // work around, for some reason apple opencl can't have buffers larger 2^31 // typically runs into trap 6 @@ -16338,14 +16353,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) + size_kernel_params + size_spilling; - if ((size_total + EXTRA_SPACE) > device_param->device_available_mem) memory_limit_hit = 1; - - if (memory_limit_hit == 1) - { - kernel_accel_max--; - - continue; - } + if (size_total > device_param->device_available_mem) memory_limit_hit = 1; const u64 size_host_extra = (512 * 1024 * 1024); @@ -16361,11 +16369,29 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) + size_pws_base + size_host_extra; - if ((size_total_host + EXTRA_SPACE) > accel_limit_host) memory_limit_hit = 1; + if (size_total_host > accel_limit_host) memory_limit_hit = 1; + + //printf ("%zu %zu %d %d\n", size_total, device_param->device_available_mem, kernel_accel_max, kernel_threads_max); if (memory_limit_hit == 1) { - kernel_accel_max--; + if (kernel_accel_max == kernel_accel_min) + { + if ((kernel_threads_max > kernel_threads_min) && (kernel_threads_max >= (device_param->kernel_preferred_wgs_multiple * 2))) + { + kernel_threads_max -= device_param->kernel_preferred_wgs_multiple; + + kernel_accel_max = kernel_accel_max_sav; + } + else + { + break; + } + } + else + { + kernel_accel_max--; + } continue; } @@ -16375,13 +16401,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) break; } - if (kernel_accel_max < kernel_accel_min) + if (memory_limit_hit == 1) { event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory or free host memory for mapping.", device_id + 1); backend_memory_hit_warnings++; device_param->skipped_warning = true; + continue; } @@ -16392,7 +16419,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { while (kernel_accel_max > kernel_accel_min) { - const u64 kernel_power_max = hardware_power_max * kernel_accel_max; + const u64 kernel_power_max = device_param->device_processors * kernel_accel_max; if (kernel_power_max > hashes->salts_cnt) { @@ -16405,6 +16432,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } } + device_param->kernel_threads_min = kernel_threads_min; + device_param->kernel_threads_max = kernel_threads_max; + + const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) + * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 
1 : device_param->kernel_threads_max); + device_param->kernel_accel_min = kernel_accel_min; device_param->kernel_accel_max = kernel_accel_max; diff --git a/src/modules/module_00020.c b/src/modules/module_00020.c index 3d514be55..f20a23d75 100644 --- a/src/modules/module_00020.c +++ b/src/modules/module_00020.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH_SALTED; static const char *HASH_NAME = "md5($salt.$pass)"; static const u64 KERN_TYPE = 20; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_00021.c b/src/modules/module_00021.c index a9072fb4e..ce19d37ab 100644 --- a/src/modules/module_00021.c +++ b/src/modules/module_00021.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FORUM_SOFTWARE; static const char *HASH_NAME = "osCommerce, xt:Commerce"; static const u64 KERN_TYPE = 20; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_00022.c b/src/modules/module_00022.c index 4252f2f6d..d5742c8f0 100644 --- a/src/modules/module_00022.c +++ b/src/modules/module_00022.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_OS; static const char *HASH_NAME = "Juniper NetScreen/SSG (ScreenOS)"; static const u64 KERN_TYPE = 20; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_00023.c b/src/modules/module_00023.c index b50bf7a84..d6b2f281a 100644 --- a/src/modules/module_00023.c +++ b/src/modules/module_00023.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_IMS; static const char *HASH_NAME = "Skype"; static const u64 KERN_TYPE = 20; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_00024.c b/src/modules/module_00024.c index a2c4d2e90..f3da3ba75 100644 --- a/src/modules/module_00024.c +++ b/src/modules/module_00024.c @@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_EAS; static const char *HASH_NAME = "SolarWinds Serv-U"; static const u64 KERN_TYPE = 20; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT | OPTI_TYPE_PRECOMPUTE_INIT | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED diff --git a/src/modules/module_01500.c b/src/modules/module_01500.c index b66dddff1..286eda992 100644 --- a/src/modules/module_01500.c +++ b/src/modules/module_01500.c @@ -20,7 +20,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; // originally DGST_SIZE_4_2 static const u32 HASH_CATEGORY = HASH_CATEGORY_OS; static const char *HASH_NAME = "descrypt, DES (Unix), Traditional DES"; static const u64 KERN_TYPE = 1500; -static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_REGISTER_LIMIT; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE | OPTS_TYPE_TM_KERNEL From 03ea85dc688f42734528c3d735d84e4f803dd541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=97=84=ED=83=9C=EC=9A=A9?= Date: Tue, 1 Jul 2025 18:26:07 +0900 Subject: [PATCH 19/37] [FIX] Prevent NULL dereference in read_restore() via hcmalloc --- src/restore.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/restore.c 
b/src/restore.c index f2a59f7c1..2ca95ba92 100644 --- a/src/restore.c +++ b/src/restore.c @@ -89,6 +89,14 @@ static int read_restore (hashcat_ctx_t *hashcat_ctx) char *buf = (char *) hcmalloc (HCBUFSIZ_LARGE); + if (buf == NULL) + { + event_log_error(hashcat_ctx, "hcmalloc: %s", strerror(errno)); + hc_fclose(&fp); + hcfree(rd->argv); + return -1; + } + for (u32 i = 0; i < rd->argc; i++) { if (hc_fgets (buf, HCBUFSIZ_LARGE - 1, &fp) == NULL) From 4d4bb71d48b5c90bf36659dfa998c095bebfb5da Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 1 Jul 2025 16:02:40 +0200 Subject: [PATCH 20/37] Use a combination of device_processor == 1 and CL_DEVICE_HOST_UNIFIED_MEMORY == 1 to determine if the device is an APU. In that case, overwrite the device_processor count with 64 to correct the invalid value of 1 reported by the HIP and OpenCL runtimes. The value 64 is obtained from rocminfo. This might not be accurate for other APUs beyond the one used as a reference, but oversubscribing an APU does not negatively affect performance, so this should be acceptable. Also fixed a syntax error in inc_vendor.h related to funnel shift. --- OpenCL/inc_vendor.h | 6 ------ src/backend.c | 50 ++++++++++++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/OpenCL/inc_vendor.h b/OpenCL/inc_vendor.h index a0df70d23..40414cbb2 100644 --- a/OpenCL/inc_vendor.h +++ b/OpenCL/inc_vendor.h @@ -218,15 +218,9 @@ using namespace metal; #define s3 w #endif -// funnelshift is nv's bitalign starting from sm 70 - -#ifdef IS_NV #if HAS_SHFW == 1 #define USE_FUNNELSHIFT #endif -#elif IS_HIP -#define USE_FUNNELSHIFT -#endif // some algorithms do not like this, eg 150, 1100, ... diff --git a/src/backend.c b/src/backend.c index a9318be79..8213f4ea5 100644 --- a/src/backend.c +++ b/src/backend.c @@ -6094,6 +6094,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->device_processors = device_processors; + if ((device_param->device_processors == 1) && (device_param->device_host_unified_memory == 1)) + { + // APUs return some weird numbers. These values seem more appropriate (from rocminfo) + //Compute Unit: 2 + //SIMDs per CU: 2 + //Wavefront Size: 32(0x20) + //Max Waves Per CU: 32(0x20) + + device_param->device_processors = 2 * 32; + } + // device_global_mem, device_maxmem_alloc, device_available_mem size_t bytes = 0; @@ -6399,7 +6410,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_lop3 = false; device_param->has_mov64 = false; device_param->has_prmt = false; - device_param->has_shfw = false; + device_param->has_shfw = prop.arch.hasFunnelShift; // device_available_mem @@ -7218,6 +7229,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->opencl_device_c_version = opencl_device_c_version; + // device_host_unified_memory + + cl_bool device_host_unified_memory = false; + + if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof (device_host_unified_memory), &device_host_unified_memory, NULL) == -1) + { + device_param->skipped = true; + + continue; + } + + device_param->device_host_unified_memory = (device_host_unified_memory == CL_TRUE) ? 
1 : 0; + // max_compute_units cl_uint device_processors = 0; @@ -7231,6 +7255,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->device_processors = device_processors; + if ((device_param->device_processors == 1) && (device_param->device_host_unified_memory == 1)) + { + // APUs return some weird numbers. These values seem more appropriate (from rocminfo) + //Compute Unit: 2 + //SIMDs per CU: 2 + //Wavefront Size: 32(0x20) + //Max Waves Per CU: 32(0x20) + + device_param->device_processors = 2 * 32; + } + #if defined (__APPLE__) if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { @@ -7258,19 +7293,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) } #endif // __APPLE__ - // device_host_unified_memory - - cl_bool device_host_unified_memory = false; - - if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof (device_host_unified_memory), &device_host_unified_memory, NULL) == -1) - { - device_param->skipped = true; - - continue; - } - - device_param->device_host_unified_memory = (device_host_unified_memory == CL_TRUE) ? 1 : 0; - // device_global_mem cl_ulong device_global_mem = 0; From 76869e5099273d83f7b3cc23b6ac734a8c840495 Mon Sep 17 00:00:00 2001 From: Royce Williams Date: Tue, 1 Jul 2025 18:31:20 -0800 Subject: [PATCH 21/37] skip preamble if --backend_info and --machine --- src/terminal.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/terminal.c b/src/terminal.c index 2976bc7c2..e929a6ab8 100644 --- a/src/terminal.c +++ b/src/terminal.c @@ -88,8 +88,11 @@ void welcome_screen (hashcat_ctx_t *hashcat_ctx, const char *version_tag) } else if (user_options->backend_info > 0) { - event_log_info (hashcat_ctx, "%s (%s) starting in backend information mode", PROGNAME, version_tag); - event_log_info (hashcat_ctx, NULL); + if (user_options->machine_readable == false) + { + event_log_info (hashcat_ctx, "%s (%s) starting in backend information mode", PROGNAME, version_tag); + event_log_info (hashcat_ctx, NULL); + } } else if (user_options->hash_mode_chgd == false) { From 3c1649ccc8ddee97348c624fc0913682a77de692 Mon Sep 17 00:00:00 2001 From: Pelle Kuiters Date: Wed, 2 Jul 2025 10:47:00 +0200 Subject: [PATCH 22/37] GPU support for Argon2id for NVIDIA CUDA --- OpenCL/inc_common.cl | 83 ++++++++ OpenCL/inc_common.h | 4 + OpenCL/inc_hash_argon2.cl | 383 +++++++++++++++++++++++++++++++++++++ OpenCL/inc_hash_argon2.h | 84 ++++++++ OpenCL/inc_platform.cl | 37 +++- OpenCL/inc_platform.h | 5 +- OpenCL/m34000-pure.cl | 93 +++++++++ src/autotune.c | 2 +- src/backend.c | 13 +- src/modules/module_34000.c | 363 +++++++++++++++++++++++++++++++++++ 10 files changed, 1055 insertions(+), 12 deletions(-) create mode 100644 OpenCL/inc_hash_argon2.cl create mode 100644 OpenCL/inc_hash_argon2.h create mode 100644 OpenCL/m34000-pure.cl create mode 100644 src/modules/module_34000.c diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index a23efd42b..1bb07b896 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -1946,6 +1946,19 @@ DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c) #endif +/** + * arithmetic operations + */ + +DECLSPEC u32 hc_umulhi (const u32 x, const u32 y) +{ +#if defined IS_CUDA + return __umulhi (x, y); +#else + return h32_from_64_S ((u64) x * (u64) y); +#endif +} + /** * pure scalar functions */ @@ -41979,6 +41992,76 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u w0[1] = 0; 
w0[0] = 0; break; + + case 16: + w7[3] = hc_byte_perm_S (w3[2], w3[3], selector); + w7[2] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[1] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[0] = hc_byte_perm_S (w2[3], w3[0], selector); + w6[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w5[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w4[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[0] = hc_byte_perm_S ( 0, w0[0], selector); + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 17: + w7[3] = hc_byte_perm_S (w3[1], w3[2], selector); + w7[2] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[1] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[0] = hc_byte_perm_S (w2[2], w2[3], selector); + w6[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w5[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w4[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[1] = hc_byte_perm_S ( 0, w0[0], selector); + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; } #endif } diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h index 0b51fbd29..c24ecb524 100644 --- a/OpenCL/inc_common.h +++ b/OpenCL/inc_common.h @@ -284,6 +284,10 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c); DECLSPEC u32x hc_lop_0x96 (const u32x a, const u32x b, const u32x c); DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c); +// arithmetic operations + +DECLSPEC u32 hc_umulhi (const u32 x, const u32 y); + // legacy common code DECLSPEC int ffz (const u32 v); diff --git a/OpenCL/inc_hash_argon2.cl b/OpenCL/inc_hash_argon2.cl new file mode 100644 index 000000000..a82531a2c --- /dev/null +++ b/OpenCL/inc_hash_argon2.cl @@ -0,0 +1,383 @@ +/** + * Author......: Netherlands Forensic Institute + * License.....: MIT + * + * Warp code based on original work by Ondrej Mosnáček + */ + +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.h" +#include "inc_common.h" +#include "inc_hash_blake2b.h" +#include "inc_hash_argon2.h" + +DECLSPEC void argon2_initial_block (const u32 *in, const u32 lane, const u32 blocknum, const u32 parallelism, GLOBAL_AS argon2_block_t *blocks) +{ + blake2b_ctx_t ctx; + + blake2b_init (&ctx); + + u64 blake_buf[16] = { 0 }; + + blake_buf[0] = sizeof(argon2_block_t); + + blake2b_update (&ctx, (u32 *) blake_buf, 4); + blake2b_update (&ctx, in, 64); + + blake_buf[0] = hl32_to_64 (lane, blocknum); + + blake2b_update (&ctx, (u32 *) blake_buf, 8); + + blake2b_final (&ctx); + + u64 *out = 
blocks[(blocknum * parallelism) + lane].values; + + out[0] = ctx.h[0]; + out[1] = ctx.h[1]; + out[2] = ctx.h[2]; + out[3] = ctx.h[3]; + + for (u32 off = 4; off < 124; off += 4) + { + for (u32 idx = 0; idx < 8; idx++) blake_buf[idx] = ctx.h[idx]; + + blake2b_init (&ctx); + blake2b_transform (ctx.h, blake_buf, 64, BLAKE2B_FINAL); + + out[off + 0] = ctx.h[0]; + out[off + 1] = ctx.h[1]; + out[off + 2] = ctx.h[2]; + out[off + 3] = ctx.h[3]; + } + + out[124] = ctx.h[4]; + out[125] = ctx.h[5]; + out[126] = ctx.h[6]; + out[127] = ctx.h[7]; +} + +DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, u64 *blockhash) +{ + blake2b_ctx_t ctx; + blake2b_init (&ctx); + + u32 option_input[32] = { 0 }; + + option_input[0] = options->parallelism; + option_input[1] = options->digest_len; + option_input[2] = options->memory_usage_in_kib; + option_input[3] = options->iterations; + option_input[4] = options->version; + option_input[5] = options->type; + + blake2b_update (&ctx, option_input, 24); + + u32 len_input[32] = { 0 }; + + len_input[0] = pw->pw_len; + + blake2b_update (&ctx, len_input, 4); + blake2b_update_global (&ctx, pw->i, pw->pw_len); + + len_input[0] = salt->salt_len; + + blake2b_update (&ctx, len_input, 4); + blake2b_update_global (&ctx, salt->salt_buf, salt->salt_len); + + len_input[0] = 0; + + blake2b_update (&ctx, len_input, 4); // secret (K) + blake2b_update (&ctx, len_input, 4); // associated data (X) + + blake2b_final (&ctx); + + for (u32 idx = 0; idx < 8; idx++) blockhash[idx] = ctx.h[idx]; +} + +DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, + const argon2_options_t *options, GLOBAL_AS argon2_block_t *out) +{ + u64 blockhash[16] = { 0 }; + + argon2_initial_hash (pw, salt, options, blockhash); + + // Generate the first two blocks of each lane + for (u32 lane = 0; lane < options->parallelism; lane++) + { + argon2_initial_block ((u32 *) blockhash, lane, 0, options->parallelism, out); + argon2_initial_block ((u32 *) blockhash, lane, 1, options->parallelism, out); + } +} + +DECLSPEC u64 trunc_mul (u64 x, u64 y) +{ + const u32 xlo = (u32) x; + const u32 ylo = (u32) y; + return hl32_to_64_S (hc_umulhi (xlo, ylo), (u32) (xlo * ylo)); +} + +DECLSPEC inline u32 argon2_ref_address (const argon2_options_t *options, const argon2_pos_t *pos, u32 index, u64 pseudo_random) +{ + u32 ref_lane; + u32 ref_area; + u32 ref_index; + + if ((pos->pass == 0) && (pos->slice == 0)) + { + ref_lane = pos->lane; + } + else + { + ref_lane = h32_from_64_S (pseudo_random) % options->parallelism; + } + + ref_area = (pos->pass == 0) ? 
pos->slice : (ARGON2_SYNC_POINTS - 1); + ref_area *= options->segment_length; + + if ((ref_lane == pos->lane) || (index == 0)) + { + ref_area += (index - 1); + } + + const u32 j1 = l32_from_64_S (pseudo_random); + ref_index = (ref_area - 1 - hc_umulhi (ref_area, hc_umulhi (j1, j1))); + + if (pos->pass > 0) + { + ref_index += (pos->slice + 1) * options->segment_length; + + if (ref_index >= options->lane_length) + { + ref_index -= options->lane_length; + } + } + + return (options->parallelism * ref_index) + ref_lane; +} + +DECLSPEC void swap_u64 (u64 *x, u64 *y) +{ + u64 tmp = *x; + *x = *y; + *y = tmp; +} + +DECLSPEC void transpose_permute_block (u64 R[4], int thread) +{ + if (thread & 0x08) + { + swap_u64 (&R[0], &R[2]); + swap_u64 (&R[1], &R[3]); + } + if (thread & 0x04) + { + swap_u64 (&R[0], &R[1]); + swap_u64 (&R[2], &R[3]); + } +} + +DECLSPEC int argon2_shift (int idx, int thread) +{ + const int delta = ((idx & 0x02) << 3) + (idx & 0x01); + return (thread & 0x0e) | (((thread & 0x11) + delta + 0x0e) & 0x11); +} + +DECLSPEC void argon2_hash_block (u64 R[4], int thread) +{ + for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], thread ^ (idx << 2)); + + transpose_permute_block (R, thread); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], thread ^ (idx << 2)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], (thread & 0x1c) | ((thread + idx) & 0x03)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], ((thread & 0x1c) | ((thread - idx) & 0x03)) ^ (idx << 2)); + + transpose_permute_block (R, thread); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], thread ^ (idx << 2)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], argon2_shift (idx, thread)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], argon2_shift ((4 - idx), thread)); +} + +DECLSPEC void argon2_next_addresses (const argon2_options_t *options, const argon2_pos_t *pos, u32 *addresses, u32 start_index, u32 thread) +{ + u64 Z[4] = { 0 }; + u64 tmp[4]; + + switch (thread) + { + case 0: Z[0] = pos->pass; break; + case 1: Z[0] = pos->lane; break; + case 2: Z[0] = pos->slice; break; + case 3: Z[0] = options->memory_block_count; break; + case 4: Z[0] = options->iterations; break; + case 5: Z[0] = options->type; break; + case 6: Z[0] = (start_index / 128) + 1; break; + default: Z[0] = 0; break; + } + + tmp[0] = Z[0]; + + argon2_hash_block (Z, thread); + + Z[0] ^= tmp[0]; + + for (u32 idx = 0; idx < 4; idx++) tmp[idx] = Z[idx]; + + argon2_hash_block (Z, thread); + + for (u32 idx = 0; idx < 4; idx++) Z[idx] ^= tmp[idx]; + + for (u32 i = 0, index = (start_index + thread); i < 4; i++, index += THREADS_PER_LANE) + { + addresses[i] = argon2_ref_address (options, pos, index, Z[i]); + } +} + +DECLSPEC u32 index_u32x4 (const u32 array[4], u32 index) +{ + switch (index) + { + case 0: + return array[0]; + case 1: + return array[1]; + case 2: + return array[2]; + case 3: + return array[3]; + } +} + +DECLSPEC GLOBAL_AS argon2_block_t *argon2_get_current_block (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 lane, u32 index_in_lane, u64 R[4], u32 thread) +{ + // Apply wrap-around to previous block index if the current block is the first block in the lane + const u32 prev_in_lane = 
(index_in_lane == 0) ? (options->lane_length - 1) : (index_in_lane - 1); + + argon2_block_t *prev_block = &blocks[(prev_in_lane * options->parallelism) + lane]; + + for (u32 idx = 0; idx < 4; idx++) R[idx] = prev_block->values[(idx * THREADS_PER_LANE) + thread]; + + return &blocks[(index_in_lane * options->parallelism) + lane]; +} + +DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos, bool indep_addr, const u32 addresses[4], + u32 start_index, u32 end_index, GLOBAL_AS argon2_block_t *cur_block, u64 R[4], u32 thread) +{ + for (u32 index = start_index; index < end_index; index++, cur_block += options->parallelism) + { + u32 ref_address; + + if (indep_addr) + { + ref_address = index_u32x4 (addresses, (index / THREADS_PER_LANE) % ARGON2_SYNC_POINTS); + ref_address = __shfl_sync (FULL_MASK, ref_address, index); + } + else + { + ref_address = argon2_ref_address (options, pos, index, R[0]); + ref_address = __shfl_sync (FULL_MASK, ref_address, 0); + } + + GLOBAL_AS const argon2_block_t *ref_block = &blocks[ref_address]; + + u64 tmp[4] = { 0 }; + + // First pass is overwrite, next passes are XOR with previous + if ((pos->pass > 0) && (options->version != ARGON2_VERSION_10)) + { + for (u32 idx = 0; idx < 4; idx++) tmp[idx] = cur_block->values[(idx * THREADS_PER_LANE) + thread]; + } + + for (u32 idx = 0; idx < 4; idx++) R[idx] ^= ref_block->values[(idx * THREADS_PER_LANE) + thread]; + + for (u32 idx = 0; idx < 4; idx++) tmp[idx] ^= R[idx]; + + argon2_hash_block (R, thread); + + for (u32 idx = 0; idx < 4; idx++) R[idx] ^= tmp[idx]; + + for (u32 idx = 0; idx < 4; idx++) cur_block->values[(idx * THREADS_PER_LANE) + thread] = R[idx]; + } +} + +DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos) +{ + const u32 thread = get_local_id(0); + + // We have already generated the first two blocks of each lane (for the first pass) + const u32 skip_blocks = (pos->pass == 0) && (pos->slice == 0) ? 2 : 0; + const u32 index_in_lane = (pos->slice * options->segment_length) + skip_blocks; + + u64 R[4]; + + GLOBAL_AS argon2_block_t *cur_block = argon2_get_current_block (blocks, options, pos->lane, index_in_lane, R, thread); + + if ((options->type == TYPE_I) || ((options->type == TYPE_ID) && (pos->pass == 0) && (pos->slice <= 1))) + { + for (u32 block_index = 0; block_index < options->segment_length; block_index += 128) + { + const u32 start_index = (block_index == 0) ? 
skip_blocks : block_index; + const u32 end_index = MIN(((start_index | 127) + 1), options->segment_length); + + u32 addresses[4]; + + argon2_next_addresses (options, pos, addresses, block_index, thread); + argon2_fill_subsegment (blocks, options, pos, true, addresses, start_index, end_index, cur_block, R, thread); + + cur_block += (end_index - start_index) * options->parallelism; + } + } + else + { + u32 addresses[4] = { 0 }; + + argon2_fill_subsegment (blocks, options, pos, false, addresses, skip_blocks, options->segment_length, cur_block, R, thread); + } +} + +DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out) +{ + const u32 lane_length = options->lane_length; + const u32 lanes = options->parallelism; + + argon2_block_t final_block = { }; + + for (u32 l = 0; l < lanes; l++) + { + for (u32 idx = 0; idx < 128; idx++) final_block.values[idx] ^= blocks[((lane_length - 1) * lanes) + l].values[idx]; + } + + u32 output_len [32] = {0}; + output_len [0] = options->digest_len; + + blake2b_ctx_t ctx; + blake2b_init (&ctx); + + // Override default (0x40) value in BLAKE2b + ctx.h[0] ^= 0x40 ^ options->digest_len; + + blake2b_update (&ctx, output_len, 4); + blake2b_update (&ctx, (u32 *) final_block.values, sizeof(final_block)); + + blake2b_final (&ctx); + + for (int i = 0, idx = 0; i < (options->digest_len / 4); i += 2, idx += 1) + { + out [i + 0] = l32_from_64_S (ctx.h[idx]); + out [i + 1] = h32_from_64_S (ctx.h[idx]); + } +} diff --git a/OpenCL/inc_hash_argon2.h b/OpenCL/inc_hash_argon2.h new file mode 100644 index 000000000..8980068cd --- /dev/null +++ b/OpenCL/inc_hash_argon2.h @@ -0,0 +1,84 @@ +/** + * Author......: Netherlands Forensic Institute + * License.....: MIT + */ + +#ifndef INC_HASH_ARGON2_H +#define INC_HASH_ARGON2_H + +#define MIN(a,b) (((a) < (b)) ? 
(a) : (b)) + +#define ARGON2_VERSION_10 0x10 +#define ARGON2_VERSION_13 0x13 + +#define THREADS_PER_LANE 32 +#define FULL_MASK 0xffffffff + +#define BLAKE2B_OUTBYTES 64 +#define ARGON2_SYNC_POINTS 4 +#define ARGON2_ADDRESSES_IN_BLOCK 128 + +#define TYPE_D 0 +#define TYPE_I 1 +#define TYPE_ID 2 + +#define ARGON2_G(a,b,c,d) \ +{ \ + a = a + b + 2 * trunc_mul(a, b); \ + d = blake2b_rot32_S (d ^ a); \ + c = c + d + 2 * trunc_mul(c, d); \ + b = blake2b_rot24_S (b ^ c); \ + a = a + b + 2 * trunc_mul(a, b); \ + d = blake2b_rot16_S (d ^ a); \ + c = c + d + 2 * trunc_mul(c, d); \ + b = hc_rotr64_S (b ^ c, 63); \ +} + +#define ARGON2_P() \ +{ \ + ARGON2_G(v[0], v[4], v[8], v[12]); \ + ARGON2_G(v[1], v[5], v[9], v[13]); \ + ARGON2_G(v[2], v[6], v[10], v[14]); \ + ARGON2_G(v[3], v[7], v[11], v[15]); \ + \ + ARGON2_G(v[0], v[5], v[10], v[15]); \ + ARGON2_G(v[1], v[6], v[11], v[12]); \ + ARGON2_G(v[2], v[7], v[8], v[13]); \ + ARGON2_G(v[3], v[4], v[9], v[14]); \ +} + +typedef struct argon2_block +{ + u64 values[128]; + +} argon2_block_t; + +typedef struct argon2_options +{ + u32 type; + u32 version; + + u32 iterations; + u32 parallelism; + u32 memory_usage_in_kib; + + u32 segment_length; + u32 lane_length; + u32 memory_block_count; + u32 digest_len; + +} argon2_options_t; + +typedef struct argon2_pos +{ + u32 pass; + u32 slice; + u32 lane; + +} argon2_pos_t; + +DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, GLOBAL_AS argon2_block_t *out); +DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos); +DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out); + +#endif // INC_HASH_ARGON2_H diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl index 9af10e9ba..36303296f 100644 --- a/OpenCL/inc_platform.cl +++ b/OpenCL/inc_platform.cl @@ -104,9 +104,17 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val) return atomicOr (p, val); } -DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused))) +DECLSPEC size_t get_group_id (const u32 dimindx) { - return blockIdx.x; + switch (dimindx) + { + case 0: + return blockIdx.x; + case 1: + return blockIdx.y; + case 2: + return blockIdx.z; + } } DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused))) @@ -114,15 +122,30 @@ DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused))) return (blockIdx.x * blockDim.x) + threadIdx.x; } -DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused))) +DECLSPEC size_t get_local_id (const u32 dimindx) { - return threadIdx.x; + switch (dimindx) + { + case 0: + return threadIdx.x; + case 1: + return threadIdx.y; + case 2: + return threadIdx.z; + } } -DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused))) +DECLSPEC size_t get_local_size (const u32 dimindx) { - // verify - return blockDim.x; + switch (dimindx) + { + case 0: + return blockDim.x; + case 1: + return blockDim.y; + case 2: + return blockDim.z; + } } DECLSPEC u32x rotl32 (const u32x a, const int n) diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h index cc6489afe..4e203ff18 100644 --- a/OpenCL/inc_platform.h +++ b/OpenCL/inc_platform.h @@ -27,8 +27,9 @@ DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p); DECLSPEC u32 hc_atomic_or (volatile GLOBAL_AS u32 *p, volatile const u32 val); DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused))); -DECLSPEC size_t 
get_local_id (const u32 dimindx __attribute__((unused))); -DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused))); +DECLSPEC size_t get_group_id (const u32 dimindx); +DECLSPEC size_t get_local_id (const u32 dimindx); +DECLSPEC size_t get_local_size (const u32 dimindx); DECLSPEC u32x rotl32 (const u32x a, const int n); DECLSPEC u32x rotr32 (const u32x a, const int n); diff --git a/OpenCL/m34000-pure.cl b/OpenCL/m34000-pure.cl new file mode 100644 index 000000000..8c17a523d --- /dev/null +++ b/OpenCL/m34000-pure.cl @@ -0,0 +1,93 @@ +/** + * Author......: Netherlands Forensic Institute + * License.....: MIT + */ + +#ifdef KERNEL_STATIC +#include M2S(INCLUDE_PATH/inc_platform.cl) +#include M2S(INCLUDE_PATH/inc_common.cl) +#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl) +#include M2S(INCLUDE_PATH/inc_hash_argon2.cl) +#endif + +#define COMPARE_S M2S(INCLUDE_PATH/inc_comp_single.cl) +#define COMPARE_M M2S(INCLUDE_PATH/inc_comp_multi.cl) + +typedef struct argon2_tmp +{ +#ifndef ARGON2_TMP_ELEM +#define ARGON2_TMP_ELEM 1 +#endif + + argon2_block_t blocks[ARGON2_TMP_ELEM]; + +} argon2_tmp_t; + +KERNEL_FQ void m34000_init (_KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) +{ + const u64 gid = get_global_id (0); + + if (gid >= GID_CNT) return; + + const argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST]; + + argon2_init (&pws[gid], &salt_bufs[SALT_POS_HOST], &options, tmps[gid].blocks); +} + +KERNEL_FQ void m34000_loop (_KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) +{ + const u64 gid = get_group_id (0); + const u64 lid = get_local_id (1); + const u64 lsz = get_local_size (1); + + if (gid >= GID_CNT) return; + + argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST]; + + options.parallelism = ARGON2_PARALLELISM; + + argon2_pos_t pos; + + pos.pass = (LOOP_POS / ARGON2_SYNC_POINTS); + pos.slice = (LOOP_POS % ARGON2_SYNC_POINTS); + + for (u32 i = 0; i < LOOP_CNT; i++) + { + for (pos.lane = lid; pos.lane < options.parallelism; pos.lane += lsz) + { + argon2_fill_segment (tmps[gid].blocks, &options, &pos); + } + + SYNC_THREADS (); + + pos.slice++; + + if (pos.slice == ARGON2_SYNC_POINTS) + { + pos.slice = 0; + pos.pass++; + } + } +} + +KERNEL_FQ void m34000_comp ( _KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) +{ + const u64 gid = get_global_id (0); + + if (gid >= GID_CNT) return; + + u32 out[8]; + + const argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST]; + + argon2_final (tmps[gid].blocks, &options, out); + + const u32 r0 = out[0]; + const u32 r1 = out[1]; + const u32 r2 = out[2]; + const u32 r3 = out[3]; + + #define il_pos 0 + + #include COMPARE_M +} diff --git a/src/autotune.c b/src/autotune.c index a599e65be..891baa472 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -107,7 +107,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param const double target_msec = backend_ctx->target_msec; - const u32 kernel_accel_min = device_param->kernel_accel_min; + const u32 kernel_accel_min = (hashconfig->opts_type & OPTS_TYPE_MAXIMUM_ACCEL) ? 
device_param->kernel_accel_max : device_param->kernel_accel_min; const u32 kernel_accel_max = device_param->kernel_accel_max; const u32 kernel_loops_min = device_param->kernel_loops_min; diff --git a/src/backend.c b/src/backend.c index 8213f4ea5..79b89dd04 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2662,7 +2662,16 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1; - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + if ((kern_run == KERN_RUN_2) && (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP)) + { + const u32 warp_size = device_param->kernel_preferred_wgs_multiple; + + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num, 1, 1, warp_size, blockDimY, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + } + else + { + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; + } if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1; @@ -16225,7 +16234,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // Still not 100% sure about the 64MiB here - const u64 size_device_extra = MAX ((64ULL * 1024 * 1024), size_device_extra1234); + const u64 size_device_extra = MAX ((1024 * 1024 * 1024), size_device_extra1234); // we will first decrease accel and when reached that limit, we will decrease threads // when we decrease limit this will restore accel_max diff --git a/src/modules/module_34000.c b/src/modules/module_34000.c new file mode 100644 index 000000000..64056b36f --- /dev/null +++ b/src/modules/module_34000.c @@ -0,0 +1,363 @@ +/** + * Author......: Netherlands Forensic Institute + * License.....: MIT + */ + +#include "common.h" +#include "types.h" +#include "modules.h" +#include "bitops.h" +#include "convert.h" +#include "shared.h" + +#define ARGON2_SYNC_POINTS 4 +#define ARGON2_BLOCK_SIZE 1024 + +static const u32 ATTACK_EXEC = ATTACK_EXEC_OUTSIDE_KERNEL; +static const u32 DGST_POS0 = 0; +static const u32 DGST_POS1 = 1; +static const u32 DGST_POS2 = 2; +static const u32 DGST_POS3 = 3; +static const u32 DGST_SIZE = DGST_SIZE_4_8; +static const u32 HASH_CATEGORY = HASH_CATEGORY_GENERIC_KDF; +static const char *HASH_NAME = "Argon2ID"; +static const u64 KERN_TYPE = 34000; +static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_SLOW_HASH_DIMY_LOOP; +static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE + | OPTS_TYPE_PT_GENERATE_LE + | OPTS_TYPE_NATIVE_THREADS + | OPTS_TYPE_MP_MULTI_DISABLE + | OPTS_TYPE_MAXIMUM_ACCEL; +static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; +static const char *ST_PASS = "hashcat"; +static const char *ST_HASH = "$argon2id$v=19$m=65536,t=3,p=1$FBMjI4RJBhIykCgol1KEJA$2ky5GAdhT1kH4kIgPN/oERE3Taiy43vNN70a3HpiKQU"; + +u32 module_attack_exec (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ATTACK_EXEC; } +u32 module_dgst_pos0 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t 
*user_options_extra) { return DGST_POS0; } +u32 module_dgst_pos1 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS1; } +u32 module_dgst_pos2 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS2; } +u32 module_dgst_pos3 (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS3; } +u32 module_dgst_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_SIZE; } +u32 module_hash_category (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_CATEGORY; } +const char *module_hash_name (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_NAME; } +u64 module_kern_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return KERN_TYPE; } +u32 module_opti_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTI_TYPE; } +u64 module_opts_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTS_TYPE; } +u32 module_salt_type (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return SALT_TYPE; } +const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } +const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } + +typedef struct argon2_options +{ + u32 type; + u32 version; + + u32 iterations; + u32 parallelism; + u32 memory_usage_in_kib; + + u32 segment_length; + u32 lane_length; + u32 memory_block_count; + + u32 digest_len; + +} argon2_options_t; + +static const char *SIGNATURE_ARGON2ID = "$argon2id$"; + +u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u64 esalt_size = (const u64) sizeof (argon2_options_t); + + return esalt_size; +} + +u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = 1; + + return kernel_threads_min; +} + +u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_max = 1; + + return 
kernel_threads_max; +} + +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 1; + + return kernel_loops_min; +} + +u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_max = 1; + + return kernel_loops_max; +} + +bool module_warmup_disable (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const bool warmup_disable = true; + + return warmup_disable; +} + +u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u64 tmp_size = 0; // we'll add some later + + return tmp_size; +} + +u64 module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes) +{ + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + + const u32 memory_block_count = options[0].memory_block_count; + + // we need to check that all hashes have the same memory requirement + for (u32 i = 1; i < hashes->salts_cnt; i++) + { + if (options[i].memory_block_count != memory_block_count) return (1ULL << 63) + i; + } + + // now that we know they all have the same settings, we also need to check the self-test hash is different to what the user hash is using + + if (user_options->self_test == true) + { + argon2_options_t *st_options = (argon2_options_t *) hashes->st_esalts_buf; + + if (st_options[0].memory_block_count != memory_block_count) return (1ULL << 62); + } + + const u64 tmp_size = ARGON2_BLOCK_SIZE * memory_block_count; + + return tmp_size; +} + +char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) +{ + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + + char *jit_build_options = NULL; + + hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%" PRIu32 " -D ARGON2_TMP_ELEM=%" PRIu32, options[0].parallelism, options[0].memory_block_count); + + return jit_build_options; +} + +int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) +{ + u32 *digest = (u32 *) digest_buf; + + argon2_options_t *options = (argon2_options_t *) esalt_buf; + + hc_token_t token; + + memset (&token, 0, sizeof (hc_token_t)); + + token.token_cnt = 7; + + token.signatures_cnt = 1; + token.signatures_buf[0] = SIGNATURE_ARGON2ID; + + token.len[0] = 10; + token.attr[0] = TOKEN_ATTR_FIXED_LENGTH + | TOKEN_ATTR_VERIFY_SIGNATURE; + + // version + token.len[1] = 4; + token.sep[1] = '$'; + token.attr[1] = TOKEN_ATTR_FIXED_LENGTH; + + // memoryUsageInKib + token.len_min[2] = 3; + token.len_max[2] = 12; + token.sep[2] = ','; + token.attr[2] = TOKEN_ATTR_VERIFY_LENGTH; + + // iterations 
+ token.len_min[3] = 3; + token.len_max[3] = 5; + token.sep[3] = ','; + token.attr[3] = TOKEN_ATTR_VERIFY_LENGTH; + + // parallelism + token.len_min[4] = 3; + token.len_max[4] = 5; + token.sep[4] = '$'; + token.attr[4] = TOKEN_ATTR_VERIFY_LENGTH; + + // salt + token.len_min[5] = ((SALT_MIN * 8) / 6) + 0; + token.len_max[5] = ((SALT_MAX * 8) / 6) + 3; + token.sep[5] = '$'; + token.attr[5] = TOKEN_ATTR_VERIFY_LENGTH + | TOKEN_ATTR_VERIFY_BASE64A; + + // target hash + token.len_min[6] = ((SALT_MIN * 8) / 6) + 0; + token.len_max[6] = ((SALT_MAX * 8) / 6) + 3; + token.sep[6] = '$'; + token.attr[6] = TOKEN_ATTR_VERIFY_LENGTH + | TOKEN_ATTR_VERIFY_BASE64A; + + const int rc_tokenizer = input_tokenizer ((const u8 *) line_buf, line_len, &token); + + if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); + + // argon2id config + const u8 *ver_pos = token.buf[1]; + const u8 *mem_pos = token.buf[2]; + const u8 *it_pos = token.buf[3]; + const u8 *par_pos = token.buf[4]; + + options->type = 2; // Only support for Argon2id + options->version = hc_strtoul ((const char *) ver_pos + 2, NULL, 10); + options->memory_usage_in_kib = hc_strtoul ((const char *) mem_pos + 2, NULL, 10); + options->iterations = hc_strtoul ((const char *) it_pos + 2, NULL, 10); + options->parallelism = hc_strtoul ((const char *) par_pos + 2, NULL, 10); + + if (options->version != 19 && options->version != 16) return (PARSER_HASH_VALUE); + if (options->memory_usage_in_kib < 1) return (PARSER_HASH_VALUE); + if (options->iterations < 1) return (PARSER_HASH_VALUE); + if (options->parallelism < 1 || options->parallelism > 32) return (PARSER_HASH_VALUE); + + options->segment_length = MAX (2, (options->memory_usage_in_kib / (ARGON2_SYNC_POINTS * options->parallelism))); + options->lane_length = options->segment_length * ARGON2_SYNC_POINTS; + options->memory_block_count = options->lane_length * options->parallelism; + + // salt + const int salt_len = token.len[5]; + const u8 *salt_pos = token.buf[5]; + + salt->salt_iter = options->iterations * ARGON2_SYNC_POINTS; + salt->salt_dimy = options->parallelism; + salt->salt_len = base64_decode (base64_to_int, (const u8 *) salt_pos, salt_len, (u8 *) salt->salt_buf); + + // digest/ target hash + const int digest_len = token.len[6]; + const u8 *digest_pos = token.buf[6]; + + options->digest_len = base64_decode (base64_to_int, (const u8 *) digest_pos, digest_len, (u8 *) digest); + + return (PARSER_OK); +} + +int module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const void *digest_buf, MAYBE_UNUSED const salt_t *salt, MAYBE_UNUSED const void *esalt_buf, MAYBE_UNUSED const void *hook_salt_buf, MAYBE_UNUSED const hashinfo_t *hash_info, char *line_buf, MAYBE_UNUSED const int line_size) +{ + u32 *digest = (u32 *) digest_buf; + + argon2_options_t *options = (argon2_options_t *) esalt_buf; + + // salt + char base64_salt[512] = { 0 }; + base64_encode (int_to_base64, (const u8 *) salt->salt_buf, salt->salt_len, (u8 *) base64_salt); + + // digest + char base64_digest[512] = { 0 }; + base64_encode (int_to_base64, (const u8 *) digest, options->digest_len, (u8 *) base64_digest); + + // out + u8 *out_buf = (u8 *) line_buf; + + const int out_len = snprintf ((char *) out_buf, line_size, "%sv=%d$m=%d,t=%d,p=%d$%s$%s", + SIGNATURE_ARGON2ID, + options->version, + options->memory_usage_in_kib, + options->iterations, + options->parallelism, + base64_salt, + base64_digest); + + return out_len; +} + +void module_init (module_ctx_t *module_ctx) +{ + module_ctx->module_context_size = 
MODULE_CONTEXT_SIZE_CURRENT; + module_ctx->module_interface_version = MODULE_INTERFACE_VERSION_CURRENT; + + module_ctx->module_attack_exec = module_attack_exec; + module_ctx->module_benchmark_esalt = MODULE_DEFAULT; + module_ctx->module_benchmark_hook_salt = MODULE_DEFAULT; + module_ctx->module_benchmark_mask = MODULE_DEFAULT; + module_ctx->module_benchmark_charset = MODULE_DEFAULT; + module_ctx->module_benchmark_salt = MODULE_DEFAULT; + module_ctx->module_bridge_name = MODULE_DEFAULT; + module_ctx->module_bridge_type = MODULE_DEFAULT; + module_ctx->module_build_plain_postprocess = MODULE_DEFAULT; + module_ctx->module_deep_comp_kernel = MODULE_DEFAULT; + module_ctx->module_deprecated_notice = MODULE_DEFAULT; + module_ctx->module_dgst_pos0 = module_dgst_pos0; + module_ctx->module_dgst_pos1 = module_dgst_pos1; + module_ctx->module_dgst_pos2 = module_dgst_pos2; + module_ctx->module_dgst_pos3 = module_dgst_pos3; + module_ctx->module_dgst_size = module_dgst_size; + module_ctx->module_dictstat_disable = MODULE_DEFAULT; + module_ctx->module_esalt_size = module_esalt_size; + module_ctx->module_extra_buffer_size = MODULE_DEFAULT; + module_ctx->module_extra_tmp_size = module_extra_tmp_size; + module_ctx->module_extra_tuningdb_block = MODULE_DEFAULT; + module_ctx->module_forced_outfile_format = MODULE_DEFAULT; + module_ctx->module_hash_binary_count = MODULE_DEFAULT; + module_ctx->module_hash_binary_parse = MODULE_DEFAULT; + module_ctx->module_hash_binary_save = MODULE_DEFAULT; + module_ctx->module_hash_decode_postprocess = MODULE_DEFAULT; + module_ctx->module_hash_decode_potfile = MODULE_DEFAULT; + module_ctx->module_hash_decode_zero_hash = MODULE_DEFAULT; + module_ctx->module_hash_decode = module_hash_decode; + module_ctx->module_hash_encode_status = MODULE_DEFAULT; + module_ctx->module_hash_encode_potfile = MODULE_DEFAULT; + module_ctx->module_hash_encode = module_hash_encode; + module_ctx->module_hash_init_selftest = MODULE_DEFAULT; + module_ctx->module_hash_mode = MODULE_DEFAULT; + module_ctx->module_hash_category = module_hash_category; + module_ctx->module_hash_name = module_hash_name; + module_ctx->module_hashes_count_min = MODULE_DEFAULT; + module_ctx->module_hashes_count_max = MODULE_DEFAULT; + module_ctx->module_hlfmt_disable = MODULE_DEFAULT; + module_ctx->module_hook_extra_param_size = MODULE_DEFAULT; + module_ctx->module_hook_extra_param_init = MODULE_DEFAULT; + module_ctx->module_hook_extra_param_term = MODULE_DEFAULT; + module_ctx->module_hook12 = MODULE_DEFAULT; + module_ctx->module_hook23 = MODULE_DEFAULT; + module_ctx->module_hook_salt_size = MODULE_DEFAULT; + module_ctx->module_hook_size = MODULE_DEFAULT; + module_ctx->module_jit_build_options = module_jit_build_options; + module_ctx->module_jit_cache_disable = MODULE_DEFAULT; + module_ctx->module_kernel_accel_max = MODULE_DEFAULT; + module_ctx->module_kernel_accel_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_max = module_kernel_loops_max; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; + module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_min = module_kernel_threads_min; + module_ctx->module_kern_type = module_kern_type; + module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; + module_ctx->module_opti_type = module_opti_type; + module_ctx->module_opts_type = module_opts_type; + module_ctx->module_outfile_check_disable = MODULE_DEFAULT; + module_ctx->module_outfile_check_nocomp = MODULE_DEFAULT; + module_ctx->module_potfile_custom_check = 
MODULE_DEFAULT; + module_ctx->module_potfile_disable = MODULE_DEFAULT; + module_ctx->module_potfile_keep_all_hashes = MODULE_DEFAULT; + module_ctx->module_pwdump_column = MODULE_DEFAULT; + module_ctx->module_pw_max = MODULE_DEFAULT; + module_ctx->module_pw_min = MODULE_DEFAULT; + module_ctx->module_salt_max = MODULE_DEFAULT; + module_ctx->module_salt_min = MODULE_DEFAULT; + module_ctx->module_salt_type = module_salt_type; + module_ctx->module_separator = MODULE_DEFAULT; + module_ctx->module_st_hash = module_st_hash; + module_ctx->module_st_pass = module_st_pass; + module_ctx->module_tmp_size = module_tmp_size; + module_ctx->module_unstable_warning = MODULE_DEFAULT; + module_ctx->module_warmup_disable = module_warmup_disable; +} From d9918d7e44020a526aa7bdc0e364f9c159a0c324 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 2 Jul 2025 11:02:57 +0200 Subject: [PATCH 23/37] Add Argon2 support for OpenCL and HIP ===================================== This patch modifies the existing Argon2 plugin, which was initially designed to work only with CUDA. Supporting OpenCL and HIP required broader architectural changes. 1. The tmps[] structure no longer holds the "large buffer". This buffer stored the scratch areas for all password candidates in one chunk. But we do not need to hold scratch areas for all candidates simultaneously. All we need to do is hold chunks large enough per password. To simplify logic, the buffer is not divided by password count, but divided by four, which fits within the "1/4 global memory" limit on some OpenCL runtimes. Hashcat already had logic to support this, but the buffer needed to be moved to a different buffer type. It has now been relocated from the "tmp buffer" to the "extra tmp buffer", following the same strategy used in newer SCRYPT plugins. This improves handling across several subcomponents: - Hashcat backend divides into four asymmetric buffers, hence the name "4-buffer strategy" - If the candidate count isn't divisible by 4, leftover candidates are assigned to the first (and possibly second and third) buffer - No code in the plugin is required, as this was designed for exactly such cases where future algorithms require a lot of memory - Plugin was rewritten to report the size needed in module_extra_tmp_size(), which triggers the "4-buffer" strategy - The split is not even, but each part is large enough to hold a multiple of a full scratch buffer for a password - The kernel code in m34000_init/loop/comp now uses a code block that finds its buffer by doing "group_id % 4" - Prevents the need to over-allocate memory to avoid OOB access - The original "tmps buffer" now holds a small dummy state buffer 2. Replaced warp shuffle instruction The instruction __shfl_sync() is not available in runtimes other than CUDA. Some have alternatives, some do not. To prevent branching per backend runtime, the new general macro hc__shfl_sync() replaces all calls to __shfl_sync(). This allows us to implement runtime-specific solutions and take effect at compile time to prevent regressions. - CUDA: We simply map to the original __shfl_sync() - HIP: We map to shfl(), a built-in intrinsic. This instruction doesn't support masks like __shfl_sync() does, but masks are not needed in Argon2 anyway. It requires an additional parameter, the wavefront size. This is natively 64, but we hardcode this to 32 so it aligns with NVIDIA's warp size. - OpenCL: - AMD: We have access to the instruction __builtin_amdgcn_ds_bpermute(). 
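    As an illustration only, a 64-bit lane shuffle on AMD can be emulated with
    two 32-bit bpermute calls along these lines (a sketch built from helpers
    that already exist in hashcat's OpenCL headers; the function name is made
    up and this is not the actual hc__shfl_sync() implementation):

      DECLSPEC u64 shfl64_amd_sketch (const u64 v, const int src_lane)
      {
        // ds_bpermute addresses lanes in bytes, hence the shift by 2
        const int addr = src_lane << 2;

        // shuffle the low and high 32-bit halves separately
        const u32 lo = __builtin_amdgcn_ds_bpermute (addr, l32_from_64_S (v));
        const u32 hi = __builtin_amdgcn_ds_bpermute (addr, h32_from_64_S (v));

        // recombine: high word first, low word second
        return hl32_to_64_S (hi, lo);
      }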
    This instruction only supports 32-bit integers, requiring us to pack and
    unpack the 64-bit values manually, as sketched above.
  - NVIDIA: We use inline assembly with "shfl.sync.idx.b32". Same as with AMD,
    we need to pack and unpack 32-bit integers. The 64-bit support in CUDA is
    just overloaded and internally does the same thing.
  - Others: We use a shared memory pool and combine it with a barrier. This
    LOCAL_VK pool must be sized at compile time and transported to the Argon2
    code in "inc_hash_argon2.cl". This required slightly changing all function
    declarations that use shuffles.

Unlock full threading for init and comp kernels
===============================================

This is implemented using a new flag: OPTS_TYPE_THREAD_MULTI_DISABLE

Behavior is similar to: OPTS_TYPE_MP_MULTI_DISABLE

It simply disables the multiplier normally applied to the password batch size.

Note, however, that this change completely decouples this effect from the real
threads spawned on the compute device. If the thread count is not set to 1 in
the plugin, hashcat will start autotuning it. In the case of Argon2, we
hard-code it to 32 instead, which also changes how the "warp size" was used in
the original implementation and which is not compatible with HIP and/or OpenCL.
However, we need to maintain this thread size to utilize warp shuffle and its
alternatives in other runtimes.

Benefits:

- Enables full threading for init and comp kernels (1667 H/s to 1722 H/s)
- Allows future algorithms to enable parallel processing of single password
  candidates, if supported

Plugin changes:

- Removed the "hack" where thread count = 1 disabled the multiplier
- Removed per-device warp count detection code and struct changes
- Removed warp handling and "num_elements / thread_count" division in the
  run_kernel() function

Simplified autotune logic for Argon2
====================================

The goal is to calculate the maximum number of password candidates that can run
in parallel, constrained only by device memory.

- Removed all code related to Argon2 from autotune
- Implemented in "module_extra_tuningdb_block()" (like SCRYPT)
- We create a tuningdb entry at runtime!
- Still allows override via tuningdb or CLI
- Considers register spilling (read at startup)
- Prevents global-to-host memory swap performance issues

Add Argon2I and Argon2D support
===============================

The kernel prepared by NFI already had support for the different Argon2 types.
No change was needed. To support the other Argon2 types, the tokenizer had to
be improved to support a variety of different signatures in the same hash-mode.

Bugfixes
========

- Fixed missing entries in "switch_buffer_by_offset_8x4_le_S()"
- Fixed benchmark hash misdetection for scrypt. This was due to outdated logic
  used in scrypt to detect whether the plugin was called from a benchmark
  session or a regular one
- Fixed a bug in "module_hash_encode()" where Base64 padding '=' was retained
- Fixed missing "GLOBAL_AS" / "PRIVATE_AS" casts for OpenCL
- Fixed compiler warnings (e.g., "index_u32x4()", "get_group_id()") by adding
  return values
- Fixed a bug in token.len_max[6], which allowed decoding of 256 bytes of data
  into a 16-byte buffer (digest)

Other improvements
==================

- Added unit test module for automated testing
- Added support to the tokenizer to allow multiple signatures.
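  For illustration, a multi-signature setup in a module could look roughly like
  this (a sketch reusing the tokenizer fields already shown in module_34000.c;
  the SIGNATURE_ARGON2D / SIGNATURE_ARGON2I defines are placeholders and the
  actual upstream change may differ):

    // placeholder defines for the additional Argon2 prefixes
    static const char *SIGNATURE_ARGON2D  = "$argon2d$";
    static const char *SIGNATURE_ARGON2I  = "$argon2i$";
    static const char *SIGNATURE_ARGON2ID = "$argon2id$";

    hc_token_t token;

    memset (&token, 0, sizeof (hc_token_t));

    // register all accepted signatures for this hash-mode
    token.signatures_cnt    = 3;
    token.signatures_buf[0] = SIGNATURE_ARGON2D;
    token.signatures_buf[1] = SIGNATURE_ARGON2I;
    token.signatures_buf[2] = SIGNATURE_ARGON2ID;

    // the signatures differ in length, so the token cannot be fixed-length
    token.attr[0] = TOKEN_ATTR_VERIFY_SIGNATURE;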
Leave out TOKEN_ATTR_FIXED_LENGTH to enable this in your plugins - Updated "hc_umulhi()", also exists for HIP - Renamed "gid" to "bid" when using "get_group_id()" for clarity - Removed "#ifdef IS_CUDA" as all backends are now supported - Removed deprecated "OPTS_TYPE_MAXIMUM_ACCEL" attribute Performance note ================ For testing, I used the self-test hash configured according to the RFC 9106 recommendation: m=65536, t=3, p=1. In my benchmarks, the AMD RX 7900 XTX achieved 1401 H/s using the same hash that was used to test NVIDIA's RTX 4090. The RTX 4090 reached 1722 H/s, making it faster in absolute terms. However, at the time of writing, it is more than three times as expensive as the 7900 XTX. It's also worth noting that an older NVIDIA GTX 1080 Ti still reached 565 H/s with the same test vector, and may be found at significantly lower cost. Across all tested Argon2 configurations, the performance gap between the RX 7900 XTX and the RTX 4090 remained proportionally consistent, indicating a clear linear scaling relationship between the two GPUs. --- OpenCL/inc_common.cl | 513 ++++++++++++++++++++++- OpenCL/inc_hash_argon2.cl | 769 ++++++++++++++++++----------------- OpenCL/inc_hash_argon2.h | 231 +++++++---- OpenCL/inc_platform.cl | 53 ++- OpenCL/inc_platform.h | 3 +- OpenCL/m34000-pure.cl | 72 +++- src/autotune.c | 2 +- src/backend.c | 13 +- src/modules/module_34000.c | 198 ++++++--- src/shared.c | 55 ++- tools/test_modules/m34000.pm | 79 ++++ 11 files changed, 1423 insertions(+), 565 deletions(-) create mode 100644 tools/test_modules/m34000.pm diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 1bb07b896..b13dc4d5f 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -1952,11 +1952,11 @@ DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c) DECLSPEC u32 hc_umulhi (const u32 x, const u32 y) { -#if defined IS_CUDA + #if defined IS_CUDA || defined IS_HIP return __umulhi (x, y); -#else + #else return h32_from_64_S ((u64) x * (u64) y); -#endif + #endif } /** @@ -41418,7 +41418,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u #endif #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV - const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; @@ -42026,6 +42025,7 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 17: @@ -42061,6 +42061,511 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u w0[2] = 0; w0[1] = 0; w0[0] = 0; + + break; + + case 18: + w7[3] = hc_byte_perm_S (w3[0], w3[1], selector); + w7[2] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[1] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[0] = hc_byte_perm_S (w2[1], w2[2], selector); + w6[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w5[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w4[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[2] = hc_byte_perm_S ( 0, w0[0], selector); + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 
0; + w0[0] = 0; + + break; + + case 19: + w7[3] = hc_byte_perm_S (w2[3], w3[0], selector); + w7[2] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[1] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[0] = hc_byte_perm_S (w2[0], w2[1], selector); + w6[3] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w5[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w4[3] = hc_byte_perm_S ( 0, w0[0], selector); + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 20: + w7[3] = hc_byte_perm_S (w2[2], w2[3], selector); + w7[2] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[1] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[0] = hc_byte_perm_S (w1[3], w2[0], selector); + w6[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w5[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[0] = hc_byte_perm_S ( 0, w0[0], selector); + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 21: + w7[3] = hc_byte_perm_S (w2[1], w2[2], selector); + w7[2] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[1] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[0] = hc_byte_perm_S (w1[2], w1[3], selector); + w6[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w5[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[1] = hc_byte_perm_S ( 0, w0[0], selector); + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 22: + w7[3] = hc_byte_perm_S (w2[0], w2[1], selector); + w7[2] = hc_byte_perm_S (w1[3], w2[0], selector); + w7[1] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[0] = hc_byte_perm_S (w1[1], w1[2], selector); + w6[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w5[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[2] = hc_byte_perm_S ( 0, w0[0], selector); + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 23: + w7[3] = 
hc_byte_perm_S (w1[3], w2[0], selector); + w7[2] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[1] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[0] = hc_byte_perm_S (w1[0], w1[1], selector); + w6[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w5[3] = hc_byte_perm_S ( 0, w0[0], selector); + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 24: + w7[3] = hc_byte_perm_S (w1[2], w1[3], selector); + w7[2] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[1] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[0] = hc_byte_perm_S (w0[3], w1[0], selector); + w6[3] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[0] = hc_byte_perm_S ( 0, w0[0], selector); + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 25: + w7[3] = hc_byte_perm_S (w1[1], w1[2], selector); + w7[2] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[1] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[0] = hc_byte_perm_S (w0[2], w0[3], selector); + w6[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[1] = hc_byte_perm_S ( 0, w0[0], selector); + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 26: + w7[3] = hc_byte_perm_S (w1[0], w1[1], selector); + w7[2] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[1] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[0] = hc_byte_perm_S (w0[1], w0[2], selector); + w6[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[2] = hc_byte_perm_S ( 0, w0[0], selector); + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 27: + w7[3] = hc_byte_perm_S (w0[3], w1[0], selector); + w7[2] = hc_byte_perm_S (w0[2], w0[3], selector); + w7[1] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[0] = hc_byte_perm_S (w0[0], w0[1], selector); + w6[3] = hc_byte_perm_S ( 0, w0[0], selector); + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 28: + w7[3] = hc_byte_perm_S (w0[2], w0[3], 
selector); + w7[2] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[1] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[0] = hc_byte_perm_S ( 0, w0[0], selector); + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 29: + w7[3] = hc_byte_perm_S (w0[1], w0[2], selector); + w7[2] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[1] = hc_byte_perm_S ( 0, w0[0], selector); + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 30: + w7[3] = hc_byte_perm_S (w0[0], w0[1], selector); + w7[2] = hc_byte_perm_S ( 0, w0[0], selector); + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 31: + w7[3] = hc_byte_perm_S ( 0, w0[0], selector); + w7[2] = 0; + w7[1] = 0; + w7[0] = 0; + w6[3] = 0; + w6[2] = 0; + w6[1] = 0; + w6[0] = 0; + w5[3] = 0; + w5[2] = 0; + w5[1] = 0; + w5[0] = 0; + w4[3] = 0; + w4[2] = 0; + w4[1] = 0; + w4[0] = 0; + w3[3] = 0; + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; } #endif diff --git a/OpenCL/inc_hash_argon2.cl b/OpenCL/inc_hash_argon2.cl index a82531a2c..dc575f8da 100644 --- a/OpenCL/inc_hash_argon2.cl +++ b/OpenCL/inc_hash_argon2.cl @@ -1,383 +1,386 @@ -/** - * Author......: Netherlands Forensic Institute - * License.....: MIT - * - * Warp code based on original work by Ondrej Mosnáček - */ - -#include "inc_vendor.h" -#include "inc_types.h" -#include "inc_platform.h" -#include "inc_common.h" -#include "inc_hash_blake2b.h" -#include "inc_hash_argon2.h" - -DECLSPEC void argon2_initial_block (const u32 *in, const u32 lane, const u32 blocknum, const u32 parallelism, GLOBAL_AS argon2_block_t *blocks) -{ - blake2b_ctx_t ctx; - - blake2b_init (&ctx); - - u64 blake_buf[16] = { 0 }; - - blake_buf[0] = sizeof(argon2_block_t); - - blake2b_update (&ctx, (u32 *) blake_buf, 4); - blake2b_update (&ctx, in, 64); - - blake_buf[0] = hl32_to_64 (lane, blocknum); - - blake2b_update (&ctx, (u32 *) blake_buf, 8); - - blake2b_final (&ctx); - - u64 *out = blocks[(blocknum * parallelism) + lane].values; - - out[0] = ctx.h[0]; - out[1] = ctx.h[1]; - out[2] = ctx.h[2]; - out[3] = ctx.h[3]; - - for (u32 off = 4; off < 124; off += 4) - { - for (u32 idx = 0; idx < 8; idx++) blake_buf[idx] = ctx.h[idx]; - - blake2b_init (&ctx); - blake2b_transform (ctx.h, blake_buf, 64, BLAKE2B_FINAL); - - out[off + 0] = ctx.h[0]; - out[off + 1] = ctx.h[1]; - out[off + 2] = ctx.h[2]; - out[off + 3] = ctx.h[3]; - } - - out[124] = ctx.h[4]; - out[125] = ctx.h[5]; 
- out[126] = ctx.h[6]; - out[127] = ctx.h[7]; -} - -DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, u64 *blockhash) -{ - blake2b_ctx_t ctx; - blake2b_init (&ctx); - - u32 option_input[32] = { 0 }; - - option_input[0] = options->parallelism; - option_input[1] = options->digest_len; - option_input[2] = options->memory_usage_in_kib; - option_input[3] = options->iterations; - option_input[4] = options->version; - option_input[5] = options->type; - - blake2b_update (&ctx, option_input, 24); - - u32 len_input[32] = { 0 }; - - len_input[0] = pw->pw_len; - - blake2b_update (&ctx, len_input, 4); - blake2b_update_global (&ctx, pw->i, pw->pw_len); - - len_input[0] = salt->salt_len; - - blake2b_update (&ctx, len_input, 4); - blake2b_update_global (&ctx, salt->salt_buf, salt->salt_len); - - len_input[0] = 0; - - blake2b_update (&ctx, len_input, 4); // secret (K) - blake2b_update (&ctx, len_input, 4); // associated data (X) - - blake2b_final (&ctx); - - for (u32 idx = 0; idx < 8; idx++) blockhash[idx] = ctx.h[idx]; -} - -DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, - const argon2_options_t *options, GLOBAL_AS argon2_block_t *out) -{ - u64 blockhash[16] = { 0 }; - - argon2_initial_hash (pw, salt, options, blockhash); - - // Generate the first two blocks of each lane - for (u32 lane = 0; lane < options->parallelism; lane++) - { - argon2_initial_block ((u32 *) blockhash, lane, 0, options->parallelism, out); - argon2_initial_block ((u32 *) blockhash, lane, 1, options->parallelism, out); - } -} - -DECLSPEC u64 trunc_mul (u64 x, u64 y) -{ - const u32 xlo = (u32) x; - const u32 ylo = (u32) y; - return hl32_to_64_S (hc_umulhi (xlo, ylo), (u32) (xlo * ylo)); -} - -DECLSPEC inline u32 argon2_ref_address (const argon2_options_t *options, const argon2_pos_t *pos, u32 index, u64 pseudo_random) -{ - u32 ref_lane; - u32 ref_area; - u32 ref_index; - - if ((pos->pass == 0) && (pos->slice == 0)) - { - ref_lane = pos->lane; - } - else - { - ref_lane = h32_from_64_S (pseudo_random) % options->parallelism; - } - - ref_area = (pos->pass == 0) ? 
pos->slice : (ARGON2_SYNC_POINTS - 1); - ref_area *= options->segment_length; - - if ((ref_lane == pos->lane) || (index == 0)) - { - ref_area += (index - 1); - } - - const u32 j1 = l32_from_64_S (pseudo_random); - ref_index = (ref_area - 1 - hc_umulhi (ref_area, hc_umulhi (j1, j1))); - - if (pos->pass > 0) - { - ref_index += (pos->slice + 1) * options->segment_length; - - if (ref_index >= options->lane_length) - { - ref_index -= options->lane_length; - } - } - - return (options->parallelism * ref_index) + ref_lane; -} - -DECLSPEC void swap_u64 (u64 *x, u64 *y) -{ - u64 tmp = *x; - *x = *y; - *y = tmp; -} - -DECLSPEC void transpose_permute_block (u64 R[4], int thread) -{ - if (thread & 0x08) - { - swap_u64 (&R[0], &R[2]); - swap_u64 (&R[1], &R[3]); - } - if (thread & 0x04) - { - swap_u64 (&R[0], &R[1]); - swap_u64 (&R[2], &R[3]); - } -} - -DECLSPEC int argon2_shift (int idx, int thread) -{ - const int delta = ((idx & 0x02) << 3) + (idx & 0x01); - return (thread & 0x0e) | (((thread & 0x11) + delta + 0x0e) & 0x11); -} - -DECLSPEC void argon2_hash_block (u64 R[4], int thread) -{ - for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], thread ^ (idx << 2)); - - transpose_permute_block (R, thread); - - for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], thread ^ (idx << 2)); - - ARGON2_G(R[0], R[1], R[2], R[3]); - - for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], (thread & 0x1c) | ((thread + idx) & 0x03)); - - ARGON2_G(R[0], R[1], R[2], R[3]); - - for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], ((thread & 0x1c) | ((thread - idx) & 0x03)) ^ (idx << 2)); - - transpose_permute_block (R, thread); - - for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], thread ^ (idx << 2)); - - ARGON2_G(R[0], R[1], R[2], R[3]); - - for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], argon2_shift (idx, thread)); - - ARGON2_G(R[0], R[1], R[2], R[3]); - - for (u32 idx = 1; idx < 4; idx++) R[idx] = __shfl_sync (FULL_MASK, R[idx], argon2_shift ((4 - idx), thread)); -} - -DECLSPEC void argon2_next_addresses (const argon2_options_t *options, const argon2_pos_t *pos, u32 *addresses, u32 start_index, u32 thread) -{ - u64 Z[4] = { 0 }; - u64 tmp[4]; - - switch (thread) - { - case 0: Z[0] = pos->pass; break; - case 1: Z[0] = pos->lane; break; - case 2: Z[0] = pos->slice; break; - case 3: Z[0] = options->memory_block_count; break; - case 4: Z[0] = options->iterations; break; - case 5: Z[0] = options->type; break; - case 6: Z[0] = (start_index / 128) + 1; break; - default: Z[0] = 0; break; - } - - tmp[0] = Z[0]; - - argon2_hash_block (Z, thread); - - Z[0] ^= tmp[0]; - - for (u32 idx = 0; idx < 4; idx++) tmp[idx] = Z[idx]; - - argon2_hash_block (Z, thread); - - for (u32 idx = 0; idx < 4; idx++) Z[idx] ^= tmp[idx]; - - for (u32 i = 0, index = (start_index + thread); i < 4; i++, index += THREADS_PER_LANE) - { - addresses[i] = argon2_ref_address (options, pos, index, Z[i]); - } -} - -DECLSPEC u32 index_u32x4 (const u32 array[4], u32 index) -{ - switch (index) - { - case 0: - return array[0]; - case 1: - return array[1]; - case 2: - return array[2]; - case 3: - return array[3]; - } -} - -DECLSPEC GLOBAL_AS argon2_block_t *argon2_get_current_block (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 lane, u32 index_in_lane, u64 R[4], u32 thread) -{ - // Apply wrap-around to previous block index if the current block is the first block in the lane - const u32 prev_in_lane = 
(index_in_lane == 0) ? (options->lane_length - 1) : (index_in_lane - 1); - - argon2_block_t *prev_block = &blocks[(prev_in_lane * options->parallelism) + lane]; - - for (u32 idx = 0; idx < 4; idx++) R[idx] = prev_block->values[(idx * THREADS_PER_LANE) + thread]; - - return &blocks[(index_in_lane * options->parallelism) + lane]; -} - -DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos, bool indep_addr, const u32 addresses[4], - u32 start_index, u32 end_index, GLOBAL_AS argon2_block_t *cur_block, u64 R[4], u32 thread) -{ - for (u32 index = start_index; index < end_index; index++, cur_block += options->parallelism) - { - u32 ref_address; - - if (indep_addr) - { - ref_address = index_u32x4 (addresses, (index / THREADS_PER_LANE) % ARGON2_SYNC_POINTS); - ref_address = __shfl_sync (FULL_MASK, ref_address, index); - } - else - { - ref_address = argon2_ref_address (options, pos, index, R[0]); - ref_address = __shfl_sync (FULL_MASK, ref_address, 0); - } - - GLOBAL_AS const argon2_block_t *ref_block = &blocks[ref_address]; - - u64 tmp[4] = { 0 }; - - // First pass is overwrite, next passes are XOR with previous - if ((pos->pass > 0) && (options->version != ARGON2_VERSION_10)) - { - for (u32 idx = 0; idx < 4; idx++) tmp[idx] = cur_block->values[(idx * THREADS_PER_LANE) + thread]; - } - - for (u32 idx = 0; idx < 4; idx++) R[idx] ^= ref_block->values[(idx * THREADS_PER_LANE) + thread]; - - for (u32 idx = 0; idx < 4; idx++) tmp[idx] ^= R[idx]; - - argon2_hash_block (R, thread); - - for (u32 idx = 0; idx < 4; idx++) R[idx] ^= tmp[idx]; - - for (u32 idx = 0; idx < 4; idx++) cur_block->values[(idx * THREADS_PER_LANE) + thread] = R[idx]; - } -} - -DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos) -{ - const u32 thread = get_local_id(0); - - // We have already generated the first two blocks of each lane (for the first pass) - const u32 skip_blocks = (pos->pass == 0) && (pos->slice == 0) ? 2 : 0; - const u32 index_in_lane = (pos->slice * options->segment_length) + skip_blocks; - - u64 R[4]; - - GLOBAL_AS argon2_block_t *cur_block = argon2_get_current_block (blocks, options, pos->lane, index_in_lane, R, thread); - - if ((options->type == TYPE_I) || ((options->type == TYPE_ID) && (pos->pass == 0) && (pos->slice <= 1))) - { - for (u32 block_index = 0; block_index < options->segment_length; block_index += 128) - { - const u32 start_index = (block_index == 0) ? 
skip_blocks : block_index; - const u32 end_index = MIN(((start_index | 127) + 1), options->segment_length); - - u32 addresses[4]; - - argon2_next_addresses (options, pos, addresses, block_index, thread); - argon2_fill_subsegment (blocks, options, pos, true, addresses, start_index, end_index, cur_block, R, thread); - - cur_block += (end_index - start_index) * options->parallelism; - } - } - else - { - u32 addresses[4] = { 0 }; - - argon2_fill_subsegment (blocks, options, pos, false, addresses, skip_blocks, options->segment_length, cur_block, R, thread); - } -} - -DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out) -{ - const u32 lane_length = options->lane_length; - const u32 lanes = options->parallelism; - - argon2_block_t final_block = { }; - - for (u32 l = 0; l < lanes; l++) - { - for (u32 idx = 0; idx < 128; idx++) final_block.values[idx] ^= blocks[((lane_length - 1) * lanes) + l].values[idx]; - } - - u32 output_len [32] = {0}; - output_len [0] = options->digest_len; - - blake2b_ctx_t ctx; - blake2b_init (&ctx); - - // Override default (0x40) value in BLAKE2b - ctx.h[0] ^= 0x40 ^ options->digest_len; - - blake2b_update (&ctx, output_len, 4); - blake2b_update (&ctx, (u32 *) final_block.values, sizeof(final_block)); - - blake2b_final (&ctx); - - for (int i = 0, idx = 0; i < (options->digest_len / 4); i += 2, idx += 1) - { - out [i + 0] = l32_from_64_S (ctx.h[idx]); - out [i + 1] = h32_from_64_S (ctx.h[idx]); - } -} +/** + * Author......: Netherlands Forensic Institute + * License.....: MIT + * + * Warp code based on original work by Ondrej Mosnáček + */ + +#include "inc_vendor.h" +#include "inc_types.h" +#include "inc_platform.h" +#include "inc_common.h" +#include "inc_hash_blake2b.h" +#include "inc_hash_argon2.h" + +DECLSPEC void argon2_initial_block (const u32 *in, const u32 lane, const u32 blocknum, const u32 parallelism, GLOBAL_AS argon2_block_t *blocks) +{ + blake2b_ctx_t ctx; + + blake2b_init (&ctx); + + u64 blake_buf[16] = { 0 }; + + blake_buf[0] = sizeof(argon2_block_t); + + blake2b_update (&ctx, (u32 *) blake_buf, 4); + blake2b_update (&ctx, in, 64); + + blake_buf[0] = hl32_to_64 (lane, blocknum); + + blake2b_update (&ctx, (u32 *) blake_buf, 8); + + blake2b_final (&ctx); + + GLOBAL_AS u64 *out = blocks[(blocknum * parallelism) + lane].values; + + out[0] = ctx.h[0]; + out[1] = ctx.h[1]; + out[2] = ctx.h[2]; + out[3] = ctx.h[3]; + + for (u32 off = 4; off < 124; off += 4) + { + for (u32 idx = 0; idx < 8; idx++) blake_buf[idx] = ctx.h[idx]; + + blake2b_init (&ctx); + blake2b_transform (ctx.h, blake_buf, 64, BLAKE2B_FINAL); + + out[off + 0] = ctx.h[0]; + out[off + 1] = ctx.h[1]; + out[off + 2] = ctx.h[2]; + out[off + 3] = ctx.h[3]; + } + + out[124] = ctx.h[4]; + out[125] = ctx.h[5]; + out[126] = ctx.h[6]; + out[127] = ctx.h[7]; +} + +DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, u64 *blockhash) +{ + blake2b_ctx_t ctx; + blake2b_init (&ctx); + + u32 option_input[32] = { 0 }; + + option_input[0] = options->parallelism; + option_input[1] = options->digest_len; + option_input[2] = options->memory_usage_in_kib; + option_input[3] = options->iterations; + option_input[4] = options->version; + option_input[5] = options->type; + + blake2b_update (&ctx, option_input, 24); + + u32 len_input[32] = { 0 }; + + len_input[0] = pw->pw_len; + + blake2b_update (&ctx, len_input, 4); + blake2b_update_global (&ctx, pw->i, pw->pw_len); + + len_input[0] = salt->salt_len; + + 
blake2b_update (&ctx, len_input, 4); + blake2b_update_global (&ctx, salt->salt_buf, salt->salt_len); + + len_input[0] = 0; + + blake2b_update (&ctx, len_input, 4); // secret (K) + blake2b_update (&ctx, len_input, 4); // associated data (X) + + blake2b_final (&ctx); + + for (u32 idx = 0; idx < 8; idx++) blockhash[idx] = ctx.h[idx]; +} + +DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, + const argon2_options_t *options, GLOBAL_AS argon2_block_t *out) +{ + u64 blockhash[16] = { 0 }; + + argon2_initial_hash (pw, salt, options, blockhash); + + // Generate the first two blocks of each lane + for (u32 lane = 0; lane < options->parallelism; lane++) + { + argon2_initial_block ((u32 *) blockhash, lane, 0, options->parallelism, out); + argon2_initial_block ((u32 *) blockhash, lane, 1, options->parallelism, out); + } +} + +// TODO: reconsider 'trunc_mul()' +DECLSPEC u64 trunc_mul (u64 x, u64 y) +{ + const u32 xlo = (u32) x; + const u32 ylo = (u32) y; + return hl32_to_64_S (hc_umulhi (xlo, ylo), (u32) (xlo * ylo)); +} + +DECLSPEC inline u32 argon2_ref_address (const argon2_options_t *options, const argon2_pos_t *pos, u32 index, u64 pseudo_random) +{ + u32 ref_lane; + u32 ref_area; + u32 ref_index; + + if ((pos->pass == 0) && (pos->slice == 0)) + { + ref_lane = pos->lane; + } + else + { + ref_lane = h32_from_64_S (pseudo_random) % options->parallelism; + } + + ref_area = (pos->pass == 0) ? pos->slice : (ARGON2_SYNC_POINTS - 1); + ref_area *= options->segment_length; + + if ((ref_lane == pos->lane) || (index == 0)) + { + ref_area += (index - 1); + } + + const u32 j1 = l32_from_64_S (pseudo_random); + ref_index = (ref_area - 1 - hc_umulhi (ref_area, hc_umulhi (j1, j1))); + + if (pos->pass > 0) + { + ref_index += (pos->slice + 1) * options->segment_length; + + if (ref_index >= options->lane_length) + { + ref_index -= options->lane_length; + } + } + + return (options->parallelism * ref_index) + ref_lane; +} + +DECLSPEC void swap_u64 (u64 *x, u64 *y) +{ + u64 tmp = *x; + *x = *y; + *y = tmp; +} + +DECLSPEC void transpose_permute_block (u64 R[4], int thread) +{ + if (thread & 0x08) + { + swap_u64 (&R[0], &R[2]); + swap_u64 (&R[1], &R[3]); + } + if (thread & 0x04) + { + swap_u64 (&R[0], &R[1]); + swap_u64 (&R[2], &R[3]); + } +} + +DECLSPEC int argon2_shift (int idx, int thread) +{ + const int delta = ((idx & 0x02) << 3) + (idx & 0x01); + return (thread & 0x0e) | (((thread & 0x11) + delta + 0x0e) & 0x11); +} + +DECLSPEC void argon2_hash_block (u64 R[4], int thread, LOCAL_AS u64 *shuffle_buf) +{ + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], thread ^ (idx << 2)); + + transpose_permute_block (R, thread); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], thread ^ (idx << 2)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], (thread & 0x1c) | ((thread + idx) & 0x03)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], ((thread & 0x1c) | ((thread - idx) & 0x03)) ^ (idx << 2)); + + transpose_permute_block (R, thread); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], thread ^ (idx << 2)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift (idx, thread)); + + ARGON2_G(R[0], R[1], R[2], R[3]); + + for 
(u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift ((4 - idx), thread)); +} + +DECLSPEC void argon2_next_addresses (const argon2_options_t *options, const argon2_pos_t *pos, u32 *addresses, u32 start_index, u32 thread, LOCAL_AS u64 *shuffle_buf) +{ + u64 Z[4] = { 0 }; + u64 tmp[4]; + + switch (thread) + { + case 0: Z[0] = pos->pass; break; + case 1: Z[0] = pos->lane; break; + case 2: Z[0] = pos->slice; break; + case 3: Z[0] = options->memory_block_count; break; + case 4: Z[0] = options->iterations; break; + case 5: Z[0] = options->type; break; + case 6: Z[0] = (start_index / 128) + 1; break; + default: Z[0] = 0; break; + } + + tmp[0] = Z[0]; + + argon2_hash_block (Z, thread, shuffle_buf); + + Z[0] ^= tmp[0]; + + for (u32 idx = 0; idx < 4; idx++) tmp[idx] = Z[idx]; + + argon2_hash_block (Z, thread, shuffle_buf); + + for (u32 idx = 0; idx < 4; idx++) Z[idx] ^= tmp[idx]; + + for (u32 i = 0, index = (start_index + thread); i < 4; i++, index += THREADS_PER_LANE) + { + addresses[i] = argon2_ref_address (options, pos, index, Z[i]); + } +} + +DECLSPEC u32 index_u32x4 (const u32 array[4], u32 index) +{ + switch (index) + { + case 0: + return array[0]; + case 1: + return array[1]; + case 2: + return array[2]; + case 3: + return array[3]; + } + + return -1; +} + +DECLSPEC GLOBAL_AS argon2_block_t *argon2_get_current_block (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 lane, u32 index_in_lane, u64 R[4], u32 thread) +{ + // Apply wrap-around to previous block index if the current block is the first block in the lane + const u32 prev_in_lane = (index_in_lane == 0) ? (options->lane_length - 1) : (index_in_lane - 1); + + GLOBAL_AS argon2_block_t *prev_block = &blocks[(prev_in_lane * options->parallelism) + lane]; + + for (u32 idx = 0; idx < 4; idx++) R[idx] = prev_block->values[(idx * THREADS_PER_LANE) + thread]; + + return &blocks[(index_in_lane * options->parallelism) + lane]; +} + +DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos, bool indep_addr, const u32 addresses[4], + u32 start_index, u32 end_index, GLOBAL_AS argon2_block_t *cur_block, u64 R[4], u32 thread, LOCAL_AS u64 *shuffle_buf) +{ + for (u32 index = start_index; index < end_index; index++, cur_block += options->parallelism) + { + u32 ref_address; + + if (indep_addr) + { + ref_address = index_u32x4 (addresses, (index / THREADS_PER_LANE) % ARGON2_SYNC_POINTS); + ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, index); + } + else + { + ref_address = argon2_ref_address (options, pos, index, R[0]); + ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, 0); + } + + GLOBAL_AS const argon2_block_t *ref_block = &blocks[ref_address]; + + u64 tmp[4] = { 0 }; + + // First pass is overwrite, next passes are XOR with previous + if ((pos->pass > 0) && (options->version != ARGON2_VERSION_10)) + { + for (u32 idx = 0; idx < 4; idx++) tmp[idx] = cur_block->values[(idx * THREADS_PER_LANE) + thread]; + } + + for (u32 idx = 0; idx < 4; idx++) R[idx] ^= ref_block->values[(idx * THREADS_PER_LANE) + thread]; + + for (u32 idx = 0; idx < 4; idx++) tmp[idx] ^= R[idx]; + + argon2_hash_block (R, thread, shuffle_buf); + + for (u32 idx = 0; idx < 4; idx++) R[idx] ^= tmp[idx]; + + for (u32 idx = 0; idx < 4; idx++) cur_block->values[(idx * THREADS_PER_LANE) + thread] = R[idx]; + } +} + +DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const 
argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf) +{ + const u32 thread = get_local_id(0); + + // We have already generated the first two blocks of each lane (for the first pass) + const u32 skip_blocks = (pos->pass == 0) && (pos->slice == 0) ? 2 : 0; + const u32 index_in_lane = (pos->slice * options->segment_length) + skip_blocks; + + u64 R[4]; + + GLOBAL_AS argon2_block_t *cur_block = argon2_get_current_block (blocks, options, pos->lane, index_in_lane, R, thread); + + if ((options->type == TYPE_I) || ((options->type == TYPE_ID) && (pos->pass == 0) && (pos->slice <= 1))) + { + for (u32 block_index = 0; block_index < options->segment_length; block_index += 128) + { + const u32 start_index = (block_index == 0) ? skip_blocks : block_index; + const u32 end_index = MIN(((start_index | 127) + 1), options->segment_length); + + u32 addresses[4]; + + argon2_next_addresses (options, pos, addresses, block_index, thread, shuffle_buf); + argon2_fill_subsegment (blocks, options, pos, true, addresses, start_index, end_index, cur_block, R, thread, shuffle_buf); + + cur_block += (end_index - start_index) * options->parallelism; + } + } + else + { + u32 addresses[4] = { 0 }; + + argon2_fill_subsegment (blocks, options, pos, false, addresses, skip_blocks, options->segment_length, cur_block, R, thread, shuffle_buf); + } +} + +DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out) +{ + const u32 lane_length = options->lane_length; + const u32 lanes = options->parallelism; + + argon2_block_t final_block = { }; + + for (u32 l = 0; l < lanes; l++) + { + for (u32 idx = 0; idx < 128; idx++) final_block.values[idx] ^= blocks[((lane_length - 1) * lanes) + l].values[idx]; + } + + u32 output_len [32] = {0}; + output_len [0] = options->digest_len; + + blake2b_ctx_t ctx; + blake2b_init (&ctx); + + // Override default (0x40) value in BLAKE2b + ctx.h[0] ^= 0x40 ^ options->digest_len; + + blake2b_update (&ctx, output_len, 4); + blake2b_update (&ctx, (u32 *) final_block.values, sizeof(final_block)); + + blake2b_final (&ctx); + + for (int i = 0, idx = 0; i < (options->digest_len / 4); i += 2, idx += 1) + { + out [i + 0] = l32_from_64_S (ctx.h[idx]); + out [i + 1] = h32_from_64_S (ctx.h[idx]); + } +} diff --git a/OpenCL/inc_hash_argon2.h b/OpenCL/inc_hash_argon2.h index 8980068cd..922afc44a 100644 --- a/OpenCL/inc_hash_argon2.h +++ b/OpenCL/inc_hash_argon2.h @@ -1,84 +1,147 @@ -/** - * Author......: Netherlands Forensic Institute - * License.....: MIT - */ - -#ifndef INC_HASH_ARGON2_H -#define INC_HASH_ARGON2_H - -#define MIN(a,b) (((a) < (b)) ? 
(a) : (b)) - -#define ARGON2_VERSION_10 0x10 -#define ARGON2_VERSION_13 0x13 - -#define THREADS_PER_LANE 32 -#define FULL_MASK 0xffffffff - -#define BLAKE2B_OUTBYTES 64 -#define ARGON2_SYNC_POINTS 4 -#define ARGON2_ADDRESSES_IN_BLOCK 128 - -#define TYPE_D 0 -#define TYPE_I 1 -#define TYPE_ID 2 - -#define ARGON2_G(a,b,c,d) \ -{ \ - a = a + b + 2 * trunc_mul(a, b); \ - d = blake2b_rot32_S (d ^ a); \ - c = c + d + 2 * trunc_mul(c, d); \ - b = blake2b_rot24_S (b ^ c); \ - a = a + b + 2 * trunc_mul(a, b); \ - d = blake2b_rot16_S (d ^ a); \ - c = c + d + 2 * trunc_mul(c, d); \ - b = hc_rotr64_S (b ^ c, 63); \ -} - -#define ARGON2_P() \ -{ \ - ARGON2_G(v[0], v[4], v[8], v[12]); \ - ARGON2_G(v[1], v[5], v[9], v[13]); \ - ARGON2_G(v[2], v[6], v[10], v[14]); \ - ARGON2_G(v[3], v[7], v[11], v[15]); \ - \ - ARGON2_G(v[0], v[5], v[10], v[15]); \ - ARGON2_G(v[1], v[6], v[11], v[12]); \ - ARGON2_G(v[2], v[7], v[8], v[13]); \ - ARGON2_G(v[3], v[4], v[9], v[14]); \ -} - -typedef struct argon2_block -{ - u64 values[128]; - -} argon2_block_t; - -typedef struct argon2_options -{ - u32 type; - u32 version; - - u32 iterations; - u32 parallelism; - u32 memory_usage_in_kib; - - u32 segment_length; - u32 lane_length; - u32 memory_block_count; - u32 digest_len; - -} argon2_options_t; - -typedef struct argon2_pos -{ - u32 pass; - u32 slice; - u32 lane; - -} argon2_pos_t; - -DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, GLOBAL_AS argon2_block_t *out); -DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos); -DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out); - -#endif // INC_HASH_ARGON2_H +/** + * Author......: Netherlands Forensic Institute + * License.....: MIT + */ + +#ifndef INC_HASH_ARGON2_H +#define INC_HASH_ARGON2_H + +#define MIN(a,b) (((a) < (b)) ? 
(a) : (b)) + +#define ARGON2_VERSION_10 0x10 +#define ARGON2_VERSION_13 0x13 + +#define THREADS_PER_LANE 32 +#define FULL_MASK 0xffffffff + +#define BLAKE2B_OUTBYTES 64 +#define ARGON2_SYNC_POINTS 4 +#define ARGON2_ADDRESSES_IN_BLOCK 128 + +#define TYPE_D 0 +#define TYPE_I 1 +#define TYPE_ID 2 + +#if defined IS_CUDA +#define hc__shfl_sync(shfbuf,mask,var,srcLane) __shfl_sync ((mask),(var),(srcLane)) +#elif defined IS_HIP +// attention hard coded 32 warps for hip here +#define hc__shfl_sync(shfbuf,mask,var,srcLane) __shfl ((var),(srcLane),32) +#elif defined IS_OPENCL +#define hc__shfl_sync(shfbuf,mask,var,srcLane) hc__shfl ((shfbuf),(var),(srcLane)) + +#if defined IS_AMD && defined IS_GPU +DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane) +{ + const u32 idx = src_lane << 2; + + const u32 l32 = l32_from_64_S (var); + const u32 h32 = h32_from_64_S (var); + + const u32 l32r = __builtin_amdgcn_ds_bpermute (idx, l32); + const u32 h32r = __builtin_amdgcn_ds_bpermute (idx, h32); + + const u64 out = hl32_to_64_S (h32r, l32r); + + return out; +} +#elif defined IS_NV && defined IS_GPU +DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane) +{ + const u32 l32 = l32_from_64_S (var); + const u32 h32 = h32_from_64_S (var); + + u32 l32r; + u32 h32r; + + asm("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0;" + : "=r"(l32r) + : "r"(l32), "r"(src_lane)); + + asm("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0;" + : "=r"(h32r) + : "r"(h32), "r"(src_lane)); + + const u64 out = hl32_to_64_S (h32r, l32r); + + return out; +} +#else +DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane) +{ + const u32 lid = get_local_id (0); + + shuffle_buf[lid] = var; + + barrier (CLK_LOCAL_MEM_FENCE); + + const u64 out = shuffle_buf[src_lane & 31]; + + return out; +} +#endif + +#elif defined IS_METAL +//todo +#endif + +#define ARGON2_G(a,b,c,d) \ +{ \ + a = a + b + 2 * trunc_mul(a, b); \ + d = blake2b_rot32_S (d ^ a); \ + c = c + d + 2 * trunc_mul(c, d); \ + b = blake2b_rot24_S (b ^ c); \ + a = a + b + 2 * trunc_mul(a, b); \ + d = blake2b_rot16_S (d ^ a); \ + c = c + d + 2 * trunc_mul(c, d); \ + b = hc_rotr64_S (b ^ c, 63); \ +} + +#define ARGON2_P() \ +{ \ + ARGON2_G(v[0], v[4], v[8], v[12]); \ + ARGON2_G(v[1], v[5], v[9], v[13]); \ + ARGON2_G(v[2], v[6], v[10], v[14]); \ + ARGON2_G(v[3], v[7], v[11], v[15]); \ + \ + ARGON2_G(v[0], v[5], v[10], v[15]); \ + ARGON2_G(v[1], v[6], v[11], v[12]); \ + ARGON2_G(v[2], v[7], v[8], v[13]); \ + ARGON2_G(v[3], v[4], v[9], v[14]); \ +} + +typedef struct argon2_block +{ + u64 values[128]; + +} argon2_block_t; + +typedef struct argon2_options +{ + u32 type; + u32 version; + + u32 iterations; + u32 parallelism; + u32 memory_usage_in_kib; + + u32 segment_length; + u32 lane_length; + u32 memory_block_count; + u32 digest_len; + +} argon2_options_t; + +typedef struct argon2_pos +{ + u32 pass; + u32 slice; + u32 lane; + +} argon2_pos_t; + +DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, GLOBAL_AS argon2_block_t *out); +DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf); +DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out); + +#endif // INC_HASH_ARGON2_H diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl index 36303296f..0a386f22e 100644 --- 
a/OpenCL/inc_platform.cl +++ b/OpenCL/inc_platform.cl @@ -114,7 +114,9 @@ DECLSPEC size_t get_group_id (const u32 dimindx) return blockIdx.y; case 2: return blockIdx.z; - } + } + + return (size_t) -1; } DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused))) @@ -133,6 +135,8 @@ DECLSPEC size_t get_local_id (const u32 dimindx) case 2: return threadIdx.z; } + + return (size_t) -1; } DECLSPEC size_t get_local_size (const u32 dimindx) @@ -145,7 +149,9 @@ DECLSPEC size_t get_local_size (const u32 dimindx) return blockDim.y; case 2: return blockDim.z; - } + } + + return (size_t) -1; } DECLSPEC u32x rotl32 (const u32x a, const int n) @@ -328,9 +334,19 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val) return atomicOr (p, val); } -DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused))) +DECLSPEC size_t get_group_id (const u32 dimindx) { - return blockIdx.x; + switch (dimindx) + { + case 0: + return blockIdx.x; + case 1: + return blockIdx.y; + case 2: + return blockIdx.z; + } + + return (size_t) -1; } DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused))) @@ -338,15 +354,34 @@ DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused))) return (blockIdx.x * blockDim.x) + threadIdx.x; } -DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused))) +DECLSPEC size_t get_local_id (const u32 dimindx) { - return threadIdx.x; + switch (dimindx) + { + case 0: + return threadIdx.x; + case 1: + return threadIdx.y; + case 2: + return threadIdx.z; + } + + return (size_t) -1; } -DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused))) +DECLSPEC size_t get_local_size (const u32 dimindx) { - // verify - return blockDim.x; + switch (dimindx) + { + case 0: + return blockDim.x; + case 1: + return blockDim.y; + case 2: + return blockDim.z; + } + + return (size_t) -1; } DECLSPEC u32x rotl32 (const u32x a, const int n) diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h index 4e203ff18..e1ffdefcf 100644 --- a/OpenCL/inc_platform.h +++ b/OpenCL/inc_platform.h @@ -49,7 +49,8 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p); DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p); DECLSPEC u32 hc_atomic_or (volatile GLOBAL_AS u32 *p, volatile const u32 val); -DECLSPEC size_t get_global_id (const u32 dimindx); +DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused))); +DECLSPEC size_t get_group_id (const u32 dimindx); DECLSPEC size_t get_local_id (const u32 dimindx); DECLSPEC size_t get_local_size (const u32 dimindx); diff --git a/OpenCL/m34000-pure.cl b/OpenCL/m34000-pure.cl index 8c17a523d..483adf59c 100644 --- a/OpenCL/m34000-pure.cl +++ b/OpenCL/m34000-pure.cl @@ -14,6 +14,12 @@ #define COMPARE_M M2S(INCLUDE_PATH/inc_comp_multi.cl) typedef struct argon2_tmp +{ + u32 state[4]; // just something for now + +} argon2_tmp_t; + +typedef struct argon2_extra { #ifndef ARGON2_TMP_ELEM #define ARGON2_TMP_ELEM 1 @@ -21,26 +27,59 @@ typedef struct argon2_tmp argon2_block_t blocks[ARGON2_TMP_ELEM]; -} argon2_tmp_t; +} argon2_extra_t; -KERNEL_FQ void m34000_init (_KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) +KERNEL_FQ KERNEL_FA void m34000_init (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) { const u64 gid = get_global_id (0); if (gid >= GID_CNT) return; + const u32 gd4 = gid / 4; + const u32 gm4 = gid % 4; + + GLOBAL_AS argon2_extra_t *V; + + switch (gm4) + { + case 0: V = (GLOBAL_AS argon2_extra_t *) d_extra0_buf; break; + case 1: V = (GLOBAL_AS argon2_extra_t *) 
d_extra1_buf; break; + case 2: V = (GLOBAL_AS argon2_extra_t *) d_extra2_buf; break; + case 3: V = (GLOBAL_AS argon2_extra_t *) d_extra3_buf; break; + } + + GLOBAL_AS argon2_extra_t *argon2_extra = V + gd4; + const argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST]; - argon2_init (&pws[gid], &salt_bufs[SALT_POS_HOST], &options, tmps[gid].blocks); + argon2_init (&pws[gid], &salt_bufs[SALT_POS_HOST], &options, argon2_extra->blocks); } -KERNEL_FQ void m34000_loop (_KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) +KERNEL_FQ KERNEL_FA void m34000_loop (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) { - const u64 gid = get_group_id (0); + const u64 bid = get_group_id (0); const u64 lid = get_local_id (1); const u64 lsz = get_local_size (1); - if (gid >= GID_CNT) return; + if (bid >= GID_CNT) return; + + LOCAL_VK u64 shuffle_bufs[ARGON2_PARALLELISM][32]; + LOCAL_AS u64 *shuffle_buf = shuffle_bufs[lid]; + + const u32 bd4 = bid / 4; + const u32 bm4 = bid % 4; + + GLOBAL_AS argon2_extra_t *V; + + switch (bm4) + { + case 0: V = (GLOBAL_AS argon2_extra_t *) d_extra0_buf; break; + case 1: V = (GLOBAL_AS argon2_extra_t *) d_extra1_buf; break; + case 2: V = (GLOBAL_AS argon2_extra_t *) d_extra2_buf; break; + case 3: V = (GLOBAL_AS argon2_extra_t *) d_extra3_buf; break; + } + + GLOBAL_AS argon2_extra_t *argon2_extra = V + bd4; argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST]; @@ -55,7 +94,7 @@ KERNEL_FQ void m34000_loop (_KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_ { for (pos.lane = lid; pos.lane < options.parallelism; pos.lane += lsz) { - argon2_fill_segment (tmps[gid].blocks, &options, &pos); + argon2_fill_segment (argon2_extra->blocks, &options, &pos, shuffle_buf); } SYNC_THREADS (); @@ -70,17 +109,32 @@ KERNEL_FQ void m34000_loop (_KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_ } } -KERNEL_FQ void m34000_comp ( _KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) +KERNEL_FQ KERNEL_FA void m34000_comp (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) { const u64 gid = get_global_id (0); if (gid >= GID_CNT) return; + const u32 gd4 = gid / 4; + const u32 gm4 = gid % 4; + + GLOBAL_AS argon2_extra_t *V; + + switch (gm4) + { + case 0: V = (GLOBAL_AS argon2_extra_t *) d_extra0_buf; break; + case 1: V = (GLOBAL_AS argon2_extra_t *) d_extra1_buf; break; + case 2: V = (GLOBAL_AS argon2_extra_t *) d_extra2_buf; break; + case 3: V = (GLOBAL_AS argon2_extra_t *) d_extra3_buf; break; + } + + GLOBAL_AS argon2_extra_t *argon2_extra = V + gd4; + u32 out[8]; const argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST]; - argon2_final (tmps[gid].blocks, &options, out); + argon2_final (argon2_extra->blocks, &options, out); const u32 r0 = out[0]; const u32 r1 = out[1]; diff --git a/src/autotune.c b/src/autotune.c index 891baa472..a599e65be 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -107,7 +107,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param const double target_msec = backend_ctx->target_msec; - const u32 kernel_accel_min = (hashconfig->opts_type & OPTS_TYPE_MAXIMUM_ACCEL) ? 
device_param->kernel_accel_max : device_param->kernel_accel_min; + const u32 kernel_accel_min = device_param->kernel_accel_min; const u32 kernel_accel_max = device_param->kernel_accel_max; const u32 kernel_loops_min = device_param->kernel_loops_min; diff --git a/src/backend.c b/src/backend.c index 8c5dc47d5..c10cf769f 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2662,16 +2662,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1; - if ((kern_run == KERN_RUN_2) && (hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP)) - { - const u32 warp_size = device_param->kernel_preferred_wgs_multiple; - - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num, 1, 1, warp_size, blockDimY, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; - } - else - { - if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; - } + if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1; if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1; @@ -16286,7 +16277,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // Still not 100% sure about the 64MiB here - const u64 size_device_extra = MAX ((1024 * 1024 * 1024), size_device_extra1234); + const u64 size_device_extra = MAX ((64ULL * 1024 * 1024), size_device_extra1234); // we will first decrease accel and when reached that limit, we will decrease threads // when we decrease limit this will restore accel_max diff --git a/src/modules/module_34000.c b/src/modules/module_34000.c index 64056b36f..60a37358c 100644 --- a/src/modules/module_34000.c +++ b/src/modules/module_34000.c @@ -9,6 +9,7 @@ #include "bitops.h" #include "convert.h" #include "shared.h" +#include "memory.h" #define ARGON2_SYNC_POINTS 4 #define ARGON2_BLOCK_SIZE 1024 @@ -18,17 +19,16 @@ static const u32 DGST_POS0 = 0; static const u32 DGST_POS1 = 1; static const u32 DGST_POS2 = 2; static const u32 DGST_POS3 = 3; -static const u32 DGST_SIZE = DGST_SIZE_4_8; +static const u32 DGST_SIZE = DGST_SIZE_8_16; static const u32 HASH_CATEGORY = HASH_CATEGORY_GENERIC_KDF; -static const char *HASH_NAME = "Argon2ID"; +static const char *HASH_NAME = "Argon2"; static const u64 KERN_TYPE = 34000; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_SLOW_HASH_DIMY_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE | OPTS_TYPE_PT_GENERATE_LE - | OPTS_TYPE_NATIVE_THREADS - | OPTS_TYPE_MP_MULTI_DISABLE - | OPTS_TYPE_MAXIMUM_ACCEL; + | OPTS_TYPE_THREAD_MULTI_DISABLE + | OPTS_TYPE_MP_MULTI_DISABLE; static const u32 SALT_TYPE = SALT_TYPE_EMBEDDED; static const char *ST_PASS = "hashcat"; static const char *ST_HASH = "$argon2id$v=19$m=65536,t=3,p=1$FBMjI4RJBhIykCgol1KEJA$2ky5GAdhT1kH4kIgPN/oERE3Taiy43vNN70a3HpiKQU"; @@ -48,6 +48,12 @@ u32 module_salt_type (MAYBE_UNUSED const hashconfig_t *hashconfig, const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } const char *module_st_pass (MAYBE_UNUSED const hashconfig_t 
*hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } +typedef struct argon2_tmp +{ + u32 state[4]; // just something for now + +} argon2_tmp_t; + typedef struct argon2_options { u32 type; @@ -65,6 +71,8 @@ typedef struct argon2_options } argon2_options_t; +static const char *SIGNATURE_ARGON2D = "$argon2d$"; +static const char *SIGNATURE_ARGON2I = "$argon2i$"; static const char *SIGNATURE_ARGON2ID = "$argon2id$"; u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) @@ -76,39 +84,18 @@ u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_min = 1; + const u32 kernel_threads_min = 32; // hard-coded in kernel return kernel_threads_min; } u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { - const u32 kernel_threads_max = 1; + const u32 kernel_threads_max = 32; // hard-coded in kernel return kernel_threads_max; } -u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_loops_min = 1; - - return kernel_loops_min; -} - -u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_loops_max = 1; - - return kernel_loops_max; -} - -bool module_warmup_disable (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const bool warmup_disable = true; - - return warmup_disable; -} - u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u64 tmp_size = 0; // we'll add some later @@ -116,28 +103,104 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user) +{ + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; + + const u32 memory_block_count = (options->memory_block_count) ? 
options->memory_block_count : options_st->memory_block_count; + + const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count; + + int lines_sz = 4096; + char *lines_buf = hcmalloc (lines_sz); + int lines_pos = 0; + + const u32 device_processors = device_param->device_processors; + + const u32 device_maxworkgroup_size = device_param->device_maxworkgroup_size; + + const u64 fixed_mem = (256 * 1024 * 1024); // some storage we need for pws[], tmps[], and others. Is around 72MiB in reality. + + const u64 spill_mem = 2048 * device_processors * device_maxworkgroup_size; // 1600 according to ptxas + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (fixed_mem + spill_mem); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel_user) + { + kernel_accel_new = kernel_accel_user; + } + else + { + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false)) + { + kernel_accel_new = available_mem / size_per_accel; + + kernel_accel_new = MIN (kernel_accel_new, 1024); // 1024 = max supported + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new); + + hcfree (new_device_name); + + return lines_buf; +} + +u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) +{ + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; + + const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; + + const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count; + + const u64 size_argon2 = device_param->kernel_accel_max * size_per_accel; + + return size_argon2; +} + u64 module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes) { - argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; - const u32 memory_block_count = options[0].memory_block_count; + const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; + const u32 parallelism = (options->parallelism) ? 
options->parallelism : options_st->parallelism; - // we need to check that all hashes have the same memory requirement for (u32 i = 1; i < hashes->salts_cnt; i++) { - if (options[i].memory_block_count != memory_block_count) return (1ULL << 63) + i; + if ((memory_block_count != options[i].memory_block_count) + || (parallelism != options[i].parallelism)) + { + return (1ULL << 63) + i; + } } // now that we know they all have the same settings, we also need to check the self-test hash is different to what the user hash is using - if (user_options->self_test == true) + if ((hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) == 0) { - argon2_options_t *st_options = (argon2_options_t *) hashes->st_esalts_buf; - - if (st_options[0].memory_block_count != memory_block_count) return (1ULL << 62); + if ((memory_block_count != options_st->memory_block_count) + || (parallelism != options_st->parallelism)) + { + return (1ULL << 62); + } } - const u64 tmp_size = ARGON2_BLOCK_SIZE * memory_block_count; + u64 tmp_size = sizeof (argon2_tmp_t); return tmp_size; } @@ -148,7 +211,7 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY char *jit_build_options = NULL; - hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%" PRIu32 " -D ARGON2_TMP_ELEM=%" PRIu32, options[0].parallelism, options[0].memory_block_count); + hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%u -D ARGON2_TMP_ELEM=%u", options[0].parallelism, options[0].memory_block_count); return jit_build_options; } @@ -165,12 +228,15 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE token.token_cnt = 7; - token.signatures_cnt = 1; - token.signatures_buf[0] = SIGNATURE_ARGON2ID; + token.signatures_cnt = 3; + token.signatures_buf[0] = SIGNATURE_ARGON2D; + token.signatures_buf[1] = SIGNATURE_ARGON2I; + token.signatures_buf[2] = SIGNATURE_ARGON2ID; - token.len[0] = 10; - token.attr[0] = TOKEN_ATTR_FIXED_LENGTH - | TOKEN_ATTR_VERIFY_SIGNATURE; + token.len_min[0] = 9; + token.len_max[0] = 10; + token.sep[0] = 0; + token.attr[0] = TOKEN_ATTR_VERIFY_SIGNATURE; // version token.len[1] = 4; @@ -203,8 +269,8 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE | TOKEN_ATTR_VERIFY_BASE64A; // target hash - token.len_min[6] = ((SALT_MIN * 8) / 6) + 0; - token.len_max[6] = ((SALT_MAX * 8) / 6) + 3; + token.len_min[6] = (( 1 * 8) / 6) + 0; + token.len_max[6] = ((128 * 8) / 6) + 3; token.sep[6] = '$'; token.attr[6] = TOKEN_ATTR_VERIFY_LENGTH | TOKEN_ATTR_VERIFY_BASE64A; @@ -213,13 +279,23 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE if (rc_tokenizer != PARSER_OK) return (rc_tokenizer); + // signature sets argon2 typ + + const int sig_len = token.len[0]; + const u8 *sig_pos = token.buf[0]; + + if (memcmp (SIGNATURE_ARGON2D, sig_pos, sig_len) == 0) options->type = 0; + else if (memcmp (SIGNATURE_ARGON2I, sig_pos, sig_len) == 0) options->type = 1; + else if (memcmp (SIGNATURE_ARGON2ID, sig_pos, sig_len) == 0) options->type = 2; + else + return (PARSER_SIGNATURE_UNMATCHED); + // argon2id config const u8 *ver_pos = token.buf[1]; const u8 *mem_pos = token.buf[2]; const u8 *it_pos = token.buf[3]; const u8 *par_pos = token.buf[4]; - options->type = 2; // Only support for Argon2id options->version = hc_strtoul ((const char *) ver_pos + 2, NULL, 10); options->memory_usage_in_kib = hc_strtoul ((const char *) mem_pos + 2, NULL, 10); options->iterations = hc_strtoul ((const char *) it_pos + 2, NULL, 10); @@ -259,17 +335,31 @@ int 
module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE // salt char base64_salt[512] = { 0 }; - base64_encode (int_to_base64, (const u8 *) salt->salt_buf, salt->salt_len, (u8 *) base64_salt); + int len1 = base64_encode (int_to_base64, (const u8 *) salt->salt_buf, salt->salt_len, (u8 *) base64_salt); + + for (int i = len1 - 1; i >=0; i--) if (base64_salt[i] == '=') base64_salt[i] = 0; // digest char base64_digest[512] = { 0 }; - base64_encode (int_to_base64, (const u8 *) digest, options->digest_len, (u8 *) base64_digest); + int len2 = base64_encode (int_to_base64, (const u8 *) digest, options->digest_len, (u8 *) base64_digest); + + for (int i = len2 - 1; i >=0; i--) if (base64_digest[i] == '=') base64_digest[i] = 0; // out + + const char *signature = NULL; + + switch (options->type) + { + case 0: signature = SIGNATURE_ARGON2D; break; + case 1: signature = SIGNATURE_ARGON2I; break; + case 2: signature = SIGNATURE_ARGON2ID; break; + } + u8 *out_buf = (u8 *) line_buf; const int out_len = snprintf ((char *) out_buf, line_size, "%sv=%d$m=%d,t=%d,p=%d$%s$%s", - SIGNATURE_ARGON2ID, + signature, options->version, options->memory_usage_in_kib, options->iterations, @@ -303,9 +393,9 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_dgst_size = module_dgst_size; module_ctx->module_dictstat_disable = MODULE_DEFAULT; module_ctx->module_esalt_size = module_esalt_size; - module_ctx->module_extra_buffer_size = MODULE_DEFAULT; + module_ctx->module_extra_buffer_size = module_extra_buffer_size; module_ctx->module_extra_tmp_size = module_extra_tmp_size; - module_ctx->module_extra_tuningdb_block = MODULE_DEFAULT; + module_ctx->module_extra_tuningdb_block = module_extra_tuningdb_block; module_ctx->module_forced_outfile_format = MODULE_DEFAULT; module_ctx->module_hash_binary_count = MODULE_DEFAULT; module_ctx->module_hash_binary_parse = MODULE_DEFAULT; @@ -335,8 +425,8 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_jit_cache_disable = MODULE_DEFAULT; module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; - module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = module_kernel_loops_min; + module_ctx->module_kernel_loops_max = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = MODULE_DEFAULT; module_ctx->module_kernel_threads_max = module_kernel_threads_max; module_ctx->module_kernel_threads_min = module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; @@ -359,5 +449,5 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_st_pass = module_st_pass; module_ctx->module_tmp_size = module_tmp_size; module_ctx->module_unstable_warning = MODULE_DEFAULT; - module_ctx->module_warmup_disable = module_warmup_disable; + module_ctx->module_warmup_disable = MODULE_DEFAULT; } diff --git a/src/shared.c b/src/shared.c index f7dbcd78d..0ff206815 100644 --- a/src/shared.c +++ b/src/shared.c @@ -105,8 +105,8 @@ static const char *const OPTI_STR_USES_BITS_16 = "Uses-16-Bit"; static const char *const OPTI_STR_USES_BITS_32 = "Uses-32-Bit"; static const char *const OPTI_STR_USES_BITS_64 = "Uses-64-Bit"; static const char *const OPTI_STR_SLOW_HASH_DIMY_INIT = "Slow-Hash-DimensionY-INIT"; -static const char *const OPTI_STR_SLOW_HASH_DIMY_COMP = "Slow-Hash-DimensionY-LOOP"; -static const char *const OPTI_STR_SLOW_HASH_DIMY_LOOP = "Slow-Hash-DimensionY-COMP"; +static const char *const OPTI_STR_SLOW_HASH_DIMY_LOOP = "Slow-Hash-DimensionY-LOOP"; +static 
const char *const OPTI_STR_SLOW_HASH_DIMY_COMP = "Slow-Hash-DimensionY-COMP"; static const char *const HASH_CATEGORY_UNDEFINED_STR = "Undefined"; static const char *const HASH_CATEGORY_RAW_HASH_STR = "Raw Hash"; @@ -1252,15 +1252,52 @@ int input_tokenizer (const u8 *input_buf, const int input_len, hc_token_t *token { const int len = token->len[token_idx]; - token->buf[token_idx + 1] = token->buf[token_idx] + len; - - len_left -= len; - - if (token->sep[token_idx] != 0) + if (len) { - token->buf[token_idx + 1]++; // +1 = separator + token->buf[token_idx + 1] = token->buf[token_idx] + len; - len_left--; // -1 = separator + len_left -= len; + + if (token->sep[token_idx] != 0) + { + token->buf[token_idx + 1]++; // +1 = separator + + len_left--; // -1 = separator + } + } + + const int len_min = token->len_min[token_idx]; + const int len_max = token->len_max[token_idx]; + + if (len_max) + { + bool matched = false; + + if (token->attr[token_idx] & TOKEN_ATTR_VERIFY_SIGNATURE) + { + for (int signature_idx = 0; signature_idx < token->signatures_cnt; signature_idx++) + { + const int len_sig = strlen (token->signatures_buf[signature_idx]); + + if (len_sig > len_left) continue; + + if ((len_sig >= len_min) && (len_sig <= len_max)) + { + if (memcmp (token->buf[token_idx], token->signatures_buf[signature_idx], len_sig) == 0) + { + token->len[token_idx] = len_sig; + + token->buf[token_idx + 1] = token->buf[token_idx] + len_sig; + + len_left -= len_sig; + + matched = true; + } + } + } + + if (matched == false) return (PARSER_SIGNATURE_UNMATCHED); + } } } } diff --git a/tools/test_modules/m34000.pm b/tools/test_modules/m34000.pm new file mode 100644 index 000000000..7b2e671ff --- /dev/null +++ b/tools/test_modules/m34000.pm @@ -0,0 +1,79 @@ +#!/usr/bin/env perl + +## +## Author......: See docs/credits.txt +## License.....: MIT +## + +use strict; +use warnings; + +use MIME::Base64 qw (decode_base64 encode_base64); +use Crypt::Argon2 qw (argon2_raw); + +sub module_constraints { [[0, 256], [32, 32], [-1, -1], [-1, -1], [-1, -1]] } + +sub module_generate_hash +{ + my $word = shift; + my $salt = shift; + my $sign = shift // ("argon2d","argon2i","argon2id")[random_number (0, 2)]; + my $m = shift // 65536; + my $t = shift // 3; + my $p = shift // 1; + my $len = shift // random_number (1, 2) * 16; + + my $salt_bin = pack ("H*", $salt); + + my $digest_bin = argon2_raw ($sign, $word, $salt_bin, $t, $m . 
"k", $p, $len); + + my $salt_base64 = encode_base64 ($salt_bin, ""); $salt_base64 =~ s/=+$//; + my $digest_base64 = encode_base64 ($digest_bin, ""); $digest_base64 =~ s/=+$//; + + my $hash = sprintf ('$%s$v=19$m=%d,t=%d,p=%d$%s$%s', $sign, $m, $t, $p, $salt_base64, $digest_base64); + + return $hash; +} + +sub module_verify_hash +{ + my $line = shift; + + my $idx = index ($line, ':'); + + return unless $idx >= 0; + + my $hash = substr ($line, 0, $idx); + my $word = substr ($line, $idx + 1); + + return unless ((substr ($hash, 0, 9) eq '$argon2d$') + || (substr ($hash, 0, 9) eq '$argon2i$') + || (substr ($hash, 0, 10) eq '$argon2id$')); + + my (undef, $signature, $version, $config, $salt, $digest) = split '\$', $hash; + + return unless defined $signature; + return unless defined $version; + return unless defined $config; + return unless defined $salt; + return unless defined $digest; + + my ($m_config, $t_config, $p_config) = split ("\,", $config); + + return unless ($version eq "v=19"); + + my $m = (split ("=", $m_config))[1]; + my $t = (split ("=", $t_config))[1]; + my $p = (split ("=", $p_config))[1]; + + $salt = decode_base64 ($salt); + $digest = decode_base64 ($digest); + + my $word_packed = pack_if_HEX_notation ($word); + + my $new_hash = module_generate_hash ($word_packed, unpack ("H*", $salt), $signature, $m, $t, $p, length ($digest)); + + return ($new_hash, $word); +} + +1; From 8a91fccefd8811e8d1c02d04d98675ca507bec4e Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Wed, 2 Jul 2025 22:19:39 +0200 Subject: [PATCH 24/37] porting to metal and fix OpenCL bug on hc__shfl --- OpenCL/inc_hash_argon2.cl | 123 +++++++++++++++++++++----------------- OpenCL/inc_hash_argon2.h | 38 +++++++----- OpenCL/m34000-pure.cl | 6 +- 3 files changed, 98 insertions(+), 69 deletions(-) diff --git a/OpenCL/inc_hash_argon2.cl b/OpenCL/inc_hash_argon2.cl index dc575f8da..c87179d2c 100644 --- a/OpenCL/inc_hash_argon2.cl +++ b/OpenCL/inc_hash_argon2.cl @@ -12,7 +12,7 @@ #include "inc_hash_blake2b.h" #include "inc_hash_argon2.h" -DECLSPEC void argon2_initial_block (const u32 *in, const u32 lane, const u32 blocknum, const u32 parallelism, GLOBAL_AS argon2_block_t *blocks) +DECLSPEC void argon2_initial_block (PRIVATE_AS const u32 *in, const u32 lane, const u32 blocknum, const u32 parallelism, GLOBAL_AS argon2_block_t *blocks) { blake2b_ctx_t ctx; @@ -22,12 +22,12 @@ DECLSPEC void argon2_initial_block (const u32 *in, const u32 lane, const u32 blo blake_buf[0] = sizeof(argon2_block_t); - blake2b_update (&ctx, (u32 *) blake_buf, 4); + blake2b_update (&ctx, (PRIVATE_AS u32 *) blake_buf, 4); blake2b_update (&ctx, in, 64); blake_buf[0] = hl32_to_64 (lane, blocknum); - blake2b_update (&ctx, (u32 *) blake_buf, 8); + blake2b_update (&ctx, (PRIVATE_AS u32 *) blake_buf, 8); blake2b_final (&ctx); @@ -57,7 +57,7 @@ DECLSPEC void argon2_initial_block (const u32 *in, const u32 lane, const u32 blo out[127] = ctx.h[7]; } -DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, u64 *blockhash) +DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u64 *blockhash) { blake2b_ctx_t ctx; blake2b_init (&ctx); @@ -96,7 +96,7 @@ DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const sal } DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, - const argon2_options_t *options, GLOBAL_AS argon2_block_t *out) + PRIVATE_AS const 
argon2_options_t *options, GLOBAL_AS argon2_block_t *out) { u64 blockhash[16] = { 0 }; @@ -105,8 +105,8 @@ DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *sal // Generate the first two blocks of each lane for (u32 lane = 0; lane < options->parallelism; lane++) { - argon2_initial_block ((u32 *) blockhash, lane, 0, options->parallelism, out); - argon2_initial_block ((u32 *) blockhash, lane, 1, options->parallelism, out); + argon2_initial_block ((PRIVATE_AS u32 *) blockhash, lane, 0, options->parallelism, out); + argon2_initial_block ((PRIVATE_AS u32 *) blockhash, lane, 1, options->parallelism, out); } } @@ -118,11 +118,11 @@ DECLSPEC u64 trunc_mul (u64 x, u64 y) return hl32_to_64_S (hc_umulhi (xlo, ylo), (u32) (xlo * ylo)); } -DECLSPEC inline u32 argon2_ref_address (const argon2_options_t *options, const argon2_pos_t *pos, u32 index, u64 pseudo_random) +DECLSPEC inline u32 argon2_ref_address (PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, u32 index, u64 pseudo_random) { - u32 ref_lane; - u32 ref_area; - u32 ref_index; + u32 ref_lane = 0; + u32 ref_area = 0; + u32 ref_index = 0; if ((pos->pass == 0) && (pos->slice == 0)) { @@ -134,6 +134,7 @@ DECLSPEC inline u32 argon2_ref_address (const argon2_options_t *options, const a } ref_area = (pos->pass == 0) ? pos->slice : (ARGON2_SYNC_POINTS - 1); + ref_area *= options->segment_length; if ((ref_lane == pos->lane) || (index == 0)) @@ -141,7 +142,10 @@ DECLSPEC inline u32 argon2_ref_address (const argon2_options_t *options, const a ref_area += (index - 1); } + // if ref_area == 0xFFFFFFFF => bug + const u32 j1 = l32_from_64_S (pseudo_random); + ref_index = (ref_area - 1 - hc_umulhi (ref_area, hc_umulhi (j1, j1))); if (pos->pass > 0) @@ -157,68 +161,79 @@ DECLSPEC inline u32 argon2_ref_address (const argon2_options_t *options, const a return (options->parallelism * ref_index) + ref_lane; } -DECLSPEC void swap_u64 (u64 *x, u64 *y) +DECLSPEC void swap_u64 (PRIVATE_AS u64 *x, PRIVATE_AS u64 *y) { u64 tmp = *x; *x = *y; *y = tmp; } -DECLSPEC void transpose_permute_block (u64 R[4], int thread) +DECLSPEC void transpose_permute_block (u64 R[4], int argon2_thread) { - if (thread & 0x08) + if (argon2_thread & 0x08) { swap_u64 (&R[0], &R[2]); swap_u64 (&R[1], &R[3]); } - if (thread & 0x04) + if (argon2_thread & 0x04) { swap_u64 (&R[0], &R[1]); swap_u64 (&R[2], &R[3]); } } -DECLSPEC int argon2_shift (int idx, int thread) +DECLSPEC int argon2_shift (int idx, int argon2_thread) { const int delta = ((idx & 0x02) << 3) + (idx & 0x01); - return (thread & 0x0e) | (((thread & 0x11) + delta + 0x0e) & 0x11); + return (argon2_thread & 0x0e) | (((argon2_thread & 0x11) + delta + 0x0e) & 0x11); } -DECLSPEC void argon2_hash_block (u64 R[4], int thread, LOCAL_AS u64 *shuffle_buf) +DECLSPEC void argon2_hash_block (u64 R[4], int argon2_thread, LOCAL_AS u64 *shuffle_buf, int argon2_lsz) { - for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], thread ^ (idx << 2)); + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz); - transpose_permute_block (R, thread); + transpose_permute_block (R, argon2_thread); - for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], thread ^ (idx << 2)); + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz); ARGON2_G(R[0], R[1], R[2], R[3]); 
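For context on the mixing that these shuffles orchestrate: ARGON2_G is Argon2's BlaMka variant of the BLAKE2b G function, in which each 64-bit addition is strengthened by a 32x32->64-bit multiply of the operands' low halves. A scalar sketch, illustrative only and not part of this patch (it assumes blake2b_rot32_S/_rot24_S/_rot16_S used by the macro are plain 64-bit right-rotations by 32, 24 and 16, and it reuses the trunc_mul() defined earlier in this file):

DECLSPEC void blamka_g_sketch (PRIVATE_AS u64 *a, PRIVATE_AS u64 *b, PRIVATE_AS u64 *c, PRIVATE_AS u64 *d)
{
  // a += b + 2 * lo32(a) * lo32(b), then the usual BLAKE2b rotate/xor ladder
  *a = *a + *b + 2 * trunc_mul (*a, *b);
  *d = hc_rotr64_S (*d ^ *a, 32);
  *c = *c + *d + 2 * trunc_mul (*c, *d);
  *b = hc_rotr64_S (*b ^ *c, 24);
  *a = *a + *b + 2 * trunc_mul (*a, *b);
  *d = hc_rotr64_S (*d ^ *a, 16);
  *c = *c + *d + 2 * trunc_mul (*c, *d);
  *b = hc_rotr64_S (*b ^ *c, 63);
}

Each work-item holds four of the 128 u64 words of a block, so the hc__shfl_sync calls are what line up the correct quadruples for the column and diagonal applications of this G.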
- for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], (thread & 0x1c) | ((thread + idx) & 0x03)); + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], (argon2_thread & 0x1c) | ((argon2_thread + idx) & 0x03), argon2_thread, argon2_lsz); ARGON2_G(R[0], R[1], R[2], R[3]); - for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], ((thread & 0x1c) | ((thread - idx) & 0x03)) ^ (idx << 2)); + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], ((argon2_thread & 0x1c) | ((argon2_thread - idx) & 0x03)) ^ (idx << 2), argon2_thread, argon2_lsz); - transpose_permute_block (R, thread); + transpose_permute_block (R, argon2_thread); - for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], thread ^ (idx << 2)); + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz); ARGON2_G(R[0], R[1], R[2], R[3]); - for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift (idx, thread)); + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift (idx, argon2_thread), argon2_thread, argon2_lsz); ARGON2_G(R[0], R[1], R[2], R[3]); - for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift ((4 - idx), thread)); + for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift ((4 - idx), argon2_thread), argon2_thread, argon2_lsz); } -DECLSPEC void argon2_next_addresses (const argon2_options_t *options, const argon2_pos_t *pos, u32 *addresses, u32 start_index, u32 thread, LOCAL_AS u64 *shuffle_buf) +DECLSPEC void argon2_next_addresses (PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, PRIVATE_AS u32 *addresses, u32 start_index, u32 argon2_thread, LOCAL_AS u64 *shuffle_buf, u32 argon2_lsz) { - u64 Z[4] = { 0 }; + u64 Z[4]; + + Z[0] = 0; + Z[1] = 0; + Z[2] = 0; + Z[3] = 0; + u64 tmp[4]; - switch (thread) + tmp[0] = 0; + tmp[1] = 0; + tmp[2] = 0; + tmp[3] = 0; + + switch (argon2_thread) { case 0: Z[0] = pos->pass; break; case 1: Z[0] = pos->lane; break; @@ -232,20 +247,22 @@ DECLSPEC void argon2_next_addresses (const argon2_options_t *options, const argo tmp[0] = Z[0]; - argon2_hash_block (Z, thread, shuffle_buf); + argon2_hash_block (Z, argon2_thread, shuffle_buf, argon2_lsz); Z[0] ^= tmp[0]; for (u32 idx = 0; idx < 4; idx++) tmp[idx] = Z[idx]; - argon2_hash_block (Z, thread, shuffle_buf); + argon2_hash_block (Z, argon2_thread, shuffle_buf, argon2_lsz); for (u32 idx = 0; idx < 4; idx++) Z[idx] ^= tmp[idx]; - for (u32 i = 0, index = (start_index + thread); i < 4; i++, index += THREADS_PER_LANE) + for (u32 i = 0, index = (start_index + argon2_thread); i < 4; i++, index += THREADS_PER_LANE) { addresses[i] = argon2_ref_address (options, pos, index, Z[i]); } + + // if addresses[0] == 0xFFFFFFFE => bug } DECLSPEC u32 index_u32x4 (const u32 array[4], u32 index) @@ -265,20 +282,20 @@ DECLSPEC u32 index_u32x4 (const u32 array[4], u32 index) return -1; } -DECLSPEC GLOBAL_AS argon2_block_t *argon2_get_current_block (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 lane, u32 index_in_lane, u64 R[4], u32 thread) +DECLSPEC GLOBAL_AS argon2_block_t *argon2_get_current_block (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t 
*options, u32 lane, u32 index_in_lane, u64 R[4], u32 argon2_thread) { // Apply wrap-around to previous block index if the current block is the first block in the lane const u32 prev_in_lane = (index_in_lane == 0) ? (options->lane_length - 1) : (index_in_lane - 1); GLOBAL_AS argon2_block_t *prev_block = &blocks[(prev_in_lane * options->parallelism) + lane]; - for (u32 idx = 0; idx < 4; idx++) R[idx] = prev_block->values[(idx * THREADS_PER_LANE) + thread]; + for (u32 idx = 0; idx < 4; idx++) R[idx] = prev_block->values[(idx * THREADS_PER_LANE) + argon2_thread]; return &blocks[(index_in_lane * options->parallelism) + lane]; } -DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos, bool indep_addr, const u32 addresses[4], - u32 start_index, u32 end_index, GLOBAL_AS argon2_block_t *cur_block, u64 R[4], u32 thread, LOCAL_AS u64 *shuffle_buf) +DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, bool indep_addr, const u32 addresses[4], + u32 start_index, u32 end_index, GLOBAL_AS argon2_block_t *cur_block, u64 R[4], u32 argon2_thread, LOCAL_AS u64 *shuffle_buf, u32 argon2_lsz) { for (u32 index = start_index; index < end_index; index++, cur_block += options->parallelism) { @@ -287,12 +304,12 @@ DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, const ar if (indep_addr) { ref_address = index_u32x4 (addresses, (index / THREADS_PER_LANE) % ARGON2_SYNC_POINTS); - ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, index); + ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, index, argon2_thread, argon2_lsz); } else { ref_address = argon2_ref_address (options, pos, index, R[0]); - ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, 0); + ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, 0, argon2_thread, argon2_lsz); } GLOBAL_AS const argon2_block_t *ref_block = &blocks[ref_address]; @@ -302,32 +319,30 @@ DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, const ar // First pass is overwrite, next passes are XOR with previous if ((pos->pass > 0) && (options->version != ARGON2_VERSION_10)) { - for (u32 idx = 0; idx < 4; idx++) tmp[idx] = cur_block->values[(idx * THREADS_PER_LANE) + thread]; + for (u32 idx = 0; idx < 4; idx++) tmp[idx] = cur_block->values[(idx * THREADS_PER_LANE) + argon2_thread]; } - for (u32 idx = 0; idx < 4; idx++) R[idx] ^= ref_block->values[(idx * THREADS_PER_LANE) + thread]; + for (u32 idx = 0; idx < 4; idx++) R[idx] ^= ref_block->values[(idx * THREADS_PER_LANE) + argon2_thread]; for (u32 idx = 0; idx < 4; idx++) tmp[idx] ^= R[idx]; - argon2_hash_block (R, thread, shuffle_buf); + argon2_hash_block (R, argon2_thread, shuffle_buf, argon2_lsz); for (u32 idx = 0; idx < 4; idx++) R[idx] ^= tmp[idx]; - for (u32 idx = 0; idx < 4; idx++) cur_block->values[(idx * THREADS_PER_LANE) + thread] = R[idx]; + for (u32 idx = 0; idx < 4; idx++) cur_block->values[(idx * THREADS_PER_LANE) + argon2_thread] = R[idx]; } } -DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf) +DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf, const u32 argon2_thread, const u32 argon2_lsz) { - const u32 thread = get_local_id(0); - 
// We have already generated the first two blocks of each lane (for the first pass) const u32 skip_blocks = (pos->pass == 0) && (pos->slice == 0) ? 2 : 0; const u32 index_in_lane = (pos->slice * options->segment_length) + skip_blocks; u64 R[4]; - GLOBAL_AS argon2_block_t *cur_block = argon2_get_current_block (blocks, options, pos->lane, index_in_lane, R, thread); + GLOBAL_AS argon2_block_t *cur_block = argon2_get_current_block (blocks, options, pos->lane, index_in_lane, R, argon2_thread); if ((options->type == TYPE_I) || ((options->type == TYPE_ID) && (pos->pass == 0) && (pos->slice <= 1))) { @@ -336,10 +351,10 @@ DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon const u32 start_index = (block_index == 0) ? skip_blocks : block_index; const u32 end_index = MIN(((start_index | 127) + 1), options->segment_length); - u32 addresses[4]; + u32 addresses[4] = { 0, 0, 0, 0 }; - argon2_next_addresses (options, pos, addresses, block_index, thread, shuffle_buf); - argon2_fill_subsegment (blocks, options, pos, true, addresses, start_index, end_index, cur_block, R, thread, shuffle_buf); + argon2_next_addresses (options, pos, addresses, block_index, argon2_thread, shuffle_buf, argon2_lsz); + argon2_fill_subsegment (blocks, options, pos, true, addresses, start_index, end_index, cur_block, R, argon2_thread, shuffle_buf, argon2_lsz); cur_block += (end_index - start_index) * options->parallelism; } @@ -348,11 +363,11 @@ DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon { u32 addresses[4] = { 0 }; - argon2_fill_subsegment (blocks, options, pos, false, addresses, skip_blocks, options->segment_length, cur_block, R, thread, shuffle_buf); + argon2_fill_subsegment (blocks, options, pos, false, addresses, skip_blocks, options->segment_length, cur_block, R, argon2_thread, shuffle_buf, argon2_lsz); } } -DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out) +DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u32 *out) { const u32 lane_length = options->lane_length; const u32 lanes = options->parallelism; @@ -374,11 +389,11 @@ DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_optio ctx.h[0] ^= 0x40 ^ options->digest_len; blake2b_update (&ctx, output_len, 4); - blake2b_update (&ctx, (u32 *) final_block.values, sizeof(final_block)); + blake2b_update (&ctx, (PRIVATE_AS u32 *) final_block.values, sizeof(final_block)); blake2b_final (&ctx); - for (int i = 0, idx = 0; i < (options->digest_len / 4); i += 2, idx += 1) + for (uint i = 0, idx = 0; i < (options->digest_len / 4); i += 2, idx += 1) { out [i + 0] = l32_from_64_S (ctx.h[idx]); out [i + 1] = h32_from_64_S (ctx.h[idx]); diff --git a/OpenCL/inc_hash_argon2.h b/OpenCL/inc_hash_argon2.h index 922afc44a..a03b7c480 100644 --- a/OpenCL/inc_hash_argon2.h +++ b/OpenCL/inc_hash_argon2.h @@ -1,3 +1,4 @@ + /** * Author......: Netherlands Forensic Institute * License.....: MIT @@ -23,15 +24,15 @@ #define TYPE_ID 2 #if defined IS_CUDA -#define hc__shfl_sync(shfbuf,mask,var,srcLane) __shfl_sync ((mask),(var),(srcLane)) +#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) __shfl_sync ((mask),(var),(srcLane)) #elif defined IS_HIP // attention hard coded 32 warps for hip here -#define hc__shfl_sync(shfbuf,mask,var,srcLane) __shfl ((var),(srcLane),32) +#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) __shfl ((var),(srcLane),32) #elif defined 
IS_OPENCL -#define hc__shfl_sync(shfbuf,mask,var,srcLane) hc__shfl ((shfbuf),(var),(srcLane)) +#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) hc__shfl ((shfbuf),(var),(srcLane),(argon2_thread),(argon2_lsz)) #if defined IS_AMD && defined IS_GPU -DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane) +DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz) { const u32 idx = src_lane << 2; @@ -46,7 +47,7 @@ DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, co return out; } #elif defined IS_NV && defined IS_GPU -DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane) +DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz) { const u32 l32 = l32_from_64_S (var); const u32 h32 = h32_from_64_S (var); @@ -67,22 +68,31 @@ DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, co return out; } #else -DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane) +DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz) { - const u32 lid = get_local_id (0); - - shuffle_buf[lid] = var; + shuffle_buf[argon2_thread] = var; barrier (CLK_LOCAL_MEM_FENCE); - const u64 out = shuffle_buf[src_lane & 31]; + const u64 out = shuffle_buf[src_lane & (argon2_lsz - 1)]; return out; } #endif #elif defined IS_METAL -//todo +#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) hc__shfl ((shfbuf),(var),(srcLane),(argon2_thread),(argon2_lsz)) + +DECLSPEC u64 hc__shfl (LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz) +{ + shuffle_buf[argon2_thread] = var; + + SYNC_THREADS(); + + const u64 out = shuffle_buf[src_lane & (argon2_lsz - 1)]; + + return out; +} #endif #define ARGON2_G(a,b,c,d) \ @@ -140,8 +150,8 @@ typedef struct argon2_pos } argon2_pos_t; -DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, const argon2_options_t *options, GLOBAL_AS argon2_block_t *out); -DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf); -DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, const argon2_options_t *options, u32 *out); +DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, PRIVATE_AS const argon2_options_t *options, GLOBAL_AS argon2_block_t *out); +DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf, const u32 argon2_thread, const u32 argon2_lsz); +DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u32 *out); #endif // INC_HASH_ARGON2_H diff --git a/OpenCL/m34000-pure.cl b/OpenCL/m34000-pure.cl index 483adf59c..6fba3590c 100644 --- a/OpenCL/m34000-pure.cl +++ b/OpenCL/m34000-pure.cl @@ -1,3 +1,4 @@ + /** * Author......: Netherlands Forensic Institute * License.....: MIT @@ -63,6 +64,9 @@ KERNEL_FQ KERNEL_FA void m34000_loop (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2 if (bid >= GID_CNT) return; + const u32 argon2_thread = get_local_id (0); + 
const u32 argon2_lsz = get_local_size (0); + LOCAL_VK u64 shuffle_bufs[ARGON2_PARALLELISM][32]; LOCAL_AS u64 *shuffle_buf = shuffle_bufs[lid]; @@ -94,7 +98,7 @@ KERNEL_FQ KERNEL_FA void m34000_loop (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2 { for (pos.lane = lid; pos.lane < options.parallelism; pos.lane += lsz) { - argon2_fill_segment (argon2_extra->blocks, &options, &pos, shuffle_buf); + argon2_fill_segment (argon2_extra->blocks, &options, &pos, shuffle_buf, argon2_thread, argon2_lsz); } SYNC_THREADS (); From e8cf8bd146a8678c0d85b6b63180a61cf7a3da99 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Thu, 3 Jul 2025 08:10:30 +0200 Subject: [PATCH 25/37] Fix OpenCL spawning unnecessary work-item due to redundant multiplication in new OPTS_TYPE_THREAD_MULTI_DISABLE mode. Prepare Metal section in run_kernel() for 2D kernel invocation related to new salt->salt_dimy variable. Move reusable Argon2 module code into separate file argon2_common.c, similar to scrypt_commit.c, and update headers. Update existing hash mode 34000 to use argon2_common.c. --- src/backend.c | 43 +++++++-- src/modules/argon2_common.c | 174 +++++++++++++++++++++++++++++++++++ src/modules/module_34000.c | 175 ++---------------------------------- 3 files changed, 221 insertions(+), 171 deletions(-) create mode 100644 src/modules/argon2_common.c diff --git a/src/backend.c b/src/backend.c index c10cf769f..74b6b96e9 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2860,7 +2860,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (kernel_threads == 0) kernel_threads = 1; - num_elements = round_up_multiple_32 (num_elements, kernel_threads); + if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0) + { + num_elements = round_up_multiple_32 (num_elements, kernel_threads); + } if (kern_run == KERN_RUN_1) { @@ -2898,10 +2901,37 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con } } - num_elements = round_up_multiple_32 (num_elements, kernel_threads); + if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0) + { + num_elements = round_up_multiple_32 (num_elements, kernel_threads); + } + else + { + num_elements = num_elements * kernel_threads; + } - const size_t global_work_size[3] = { num_elements, 1, 1 }; - const size_t local_work_size[3] = { kernel_threads, 1, 1 }; + size_t global_work_size[3] = { num_elements, 1, 1 }; + size_t local_work_size[3] = { kernel_threads, 1, 1 }; + + cl_uint work_dim = 1; + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1)) + { + global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; + } + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2)) + { + global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; + } + + if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3)) + { + global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; + } double ms = 0; @@ -2997,7 +3027,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con } */ - num_elements = round_up_multiple_64 (num_elements, kernel_threads); + if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0) + { + num_elements = round_up_multiple_64 (num_elements, kernel_threads); + } cl_event opencl_event; diff --git a/src/modules/argon2_common.c b/src/modules/argon2_common.c new file mode
100644 index 000000000..cfe47071e --- /dev/null +++ b/src/modules/argon2_common.c @@ -0,0 +1,174 @@ +/** + * Author......: Netherlands Forensic Institute + * License.....: MIT + */ + +#include +#include "common.h" +#include "types.h" +#include "modules.h" +#include "bitops.h" +#include "convert.h" +#include "shared.h" +#include "memory.h" + +#define ARGON2_SYNC_POINTS 4 +#define ARGON2_BLOCK_SIZE 1024 + +typedef struct argon2_tmp +{ + u32 state[4]; // just something for now + +} argon2_tmp_t; + +typedef struct argon2_options +{ + u32 type; + u32 version; + + u32 iterations; + u32 parallelism; + u32 memory_usage_in_kib; + + u32 segment_length; + u32 lane_length; + u32 memory_block_count; + + u32 digest_len; + +} argon2_options_t; + +u32 argon2_module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_min = 32; // hard-coded in kernel + + return kernel_threads_min; +} + +u32 argon2_module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_threads_max = 32; // hard-coded in kernel + + return kernel_threads_max; +} + +u64 argon2_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u64 tmp_size = 0; // we'll add some later + + return tmp_size; +} + +const char *argon2_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user) +{ + hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; + + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; + + const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; + + const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count; + + int lines_sz = 4096; + char *lines_buf = hcmalloc (lines_sz); + int lines_pos = 0; + + const u32 device_processors = device_param->device_processors; + + const u32 device_maxworkgroup_size = device_param->device_maxworkgroup_size; + + const u64 fixed_mem = (256 * 1024 * 1024); // some storage we need for pws[], tmps[], and others. Is around 72MiB in reality. 
+ + const u64 spill_mem = 2048 * device_processors * device_maxworkgroup_size; // 1600 according to ptxas + + const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (fixed_mem + spill_mem); + + u32 kernel_accel_new = device_processors; + + if (kernel_accel_user) + { + kernel_accel_new = kernel_accel_user; + } + else + { + if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false)) + { + kernel_accel_new = available_mem / size_per_accel; + + kernel_accel_new = MIN (kernel_accel_new, 1024); // 1024 = max supported + } + } + + char *new_device_name = hcstrdup (device_param->device_name); + + for (size_t i = 0; i < strlen (new_device_name); i++) + { + if (new_device_name[i] == ' ') new_device_name[i] = '_'; + } + + lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new); + + hcfree (new_device_name); + + return lines_buf; +} + +u64 argon2_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) +{ + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; + + const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; + + const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count; + + const u64 size_argon2 = device_param->kernel_accel_max * size_per_accel; + + return size_argon2; +} + +u64 argon2_module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes) +{ + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; + + const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; + const u32 parallelism = (options->parallelism) ? 
options->parallelism : options_st->parallelism; + + for (u32 i = 1; i < hashes->salts_cnt; i++) + { + if ((memory_block_count != options[i].memory_block_count) + || (parallelism != options[i].parallelism)) + { + return (1ULL << 63) + i; + } + } + + // now that we know they all have the same settings, we also need to check the self-test hash is different to what the user hash is using + + if ((hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) == 0) + { + if ((memory_block_count != options_st->memory_block_count) + || (parallelism != options_st->parallelism)) + { + return (1ULL << 62); + } + } + + u64 tmp_size = sizeof (argon2_tmp_t); + + return tmp_size; +} + +char *argon2_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) +{ + argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; + + char *jit_build_options = NULL; + + hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%u -D ARGON2_TMP_ELEM=%u", options[0].parallelism, options[0].memory_block_count); + + return jit_build_options; +} + diff --git a/src/modules/module_34000.c b/src/modules/module_34000.c index 60a37358c..34b621133 100644 --- a/src/modules/module_34000.c +++ b/src/modules/module_34000.c @@ -11,9 +11,6 @@ #include "shared.h" #include "memory.h" -#define ARGON2_SYNC_POINTS 4 -#define ARGON2_BLOCK_SIZE 1024 - static const u32 ATTACK_EXEC = ATTACK_EXEC_OUTSIDE_KERNEL; static const u32 DGST_POS0 = 0; static const u32 DGST_POS1 = 1; @@ -48,28 +45,7 @@ u32 module_salt_type (MAYBE_UNUSED const hashconfig_t *hashconfig, const char *module_st_hash (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH; } const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS; } -typedef struct argon2_tmp -{ - u32 state[4]; // just something for now - -} argon2_tmp_t; - -typedef struct argon2_options -{ - u32 type; - u32 version; - - u32 iterations; - u32 parallelism; - u32 memory_usage_in_kib; - - u32 segment_length; - u32 lane_length; - u32 memory_block_count; - - u32 digest_len; - -} argon2_options_t; +#include "argon2_common.c" static const char *SIGNATURE_ARGON2D = "$argon2d$"; static const char *SIGNATURE_ARGON2I = "$argon2i$"; @@ -82,140 +58,6 @@ u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED return esalt_size; } -u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_min = 32; // hard-coded in kernel - - return kernel_threads_min; -} - -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 32; // hard-coded in kernel - - return kernel_threads_max; -} - -u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u64 tmp_size = 0; // we'll add some later - - return 
tmp_size; -} - -const char *module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user) -{ - hc_device_param_t *device_param = &backend_ctx->devices_param[device_id]; - - argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; - argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; - - const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; - - const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count; - - int lines_sz = 4096; - char *lines_buf = hcmalloc (lines_sz); - int lines_pos = 0; - - const u32 device_processors = device_param->device_processors; - - const u32 device_maxworkgroup_size = device_param->device_maxworkgroup_size; - - const u64 fixed_mem = (256 * 1024 * 1024); // some storage we need for pws[], tmps[], and others. Is around 72MiB in reality. - - const u64 spill_mem = 2048 * device_processors * device_maxworkgroup_size; // 1600 according to ptxas - - const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (fixed_mem + spill_mem); - - u32 kernel_accel_new = device_processors; - - if (kernel_accel_user) - { - kernel_accel_new = kernel_accel_user; - } - else - { - if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false)) - { - kernel_accel_new = available_mem / size_per_accel; - - kernel_accel_new = MIN (kernel_accel_new, 1024); // 1024 = max supported - } - } - - char *new_device_name = hcstrdup (device_param->device_name); - - for (size_t i = 0; i < strlen (new_device_name); i++) - { - if (new_device_name[i] == ' ') new_device_name[i] = '_'; - } - - lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new); - - hcfree (new_device_name); - - return lines_buf; -} - -u64 module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) -{ - argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; - argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; - - const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; - - const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count; - - const u64 size_argon2 = device_param->kernel_accel_max * size_per_accel; - - return size_argon2; -} - -u64 module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes) -{ - argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; - argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf; - - const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count; - const u32 parallelism = (options->parallelism) ? 
options->parallelism : options_st->parallelism; - - for (u32 i = 1; i < hashes->salts_cnt; i++) - { - if ((memory_block_count != options[i].memory_block_count) - || (parallelism != options[i].parallelism)) - { - return (1ULL << 63) + i; - } - } - - // now that we know they all have the same settings, we also need to check the self-test hash is different to what the user hash is using - - if ((hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) == 0) - { - if ((memory_block_count != options_st->memory_block_count) - || (parallelism != options_st->parallelism)) - { - return (1ULL << 62); - } - } - - u64 tmp_size = sizeof (argon2_tmp_t); - - return tmp_size; -} - -char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) -{ - argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf; - - char *jit_build_options = NULL; - - hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%u -D ARGON2_TMP_ELEM=%u", options[0].parallelism, options[0].memory_block_count); - - return jit_build_options; -} - int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) { u32 *digest = (u32 *) digest_buf; @@ -393,9 +235,9 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_dgst_size = module_dgst_size; module_ctx->module_dictstat_disable = MODULE_DEFAULT; module_ctx->module_esalt_size = module_esalt_size; - module_ctx->module_extra_buffer_size = module_extra_buffer_size; - module_ctx->module_extra_tmp_size = module_extra_tmp_size; - module_ctx->module_extra_tuningdb_block = module_extra_tuningdb_block; + module_ctx->module_extra_buffer_size = argon2_module_extra_buffer_size; + module_ctx->module_extra_tmp_size = argon2_module_extra_tmp_size; + module_ctx->module_extra_tuningdb_block = argon2_module_extra_tuningdb_block; module_ctx->module_forced_outfile_format = MODULE_DEFAULT; module_ctx->module_hash_binary_count = MODULE_DEFAULT; module_ctx->module_hash_binary_parse = MODULE_DEFAULT; @@ -421,14 +263,14 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_hook23 = MODULE_DEFAULT; module_ctx->module_hook_salt_size = MODULE_DEFAULT; module_ctx->module_hook_size = MODULE_DEFAULT; - module_ctx->module_jit_build_options = module_jit_build_options; + module_ctx->module_jit_build_options = argon2_module_jit_build_options; module_ctx->module_jit_cache_disable = MODULE_DEFAULT; module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = MODULE_DEFAULT; module_ctx->module_kernel_loops_min = MODULE_DEFAULT; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; - module_ctx->module_kernel_threads_min = module_kernel_threads_min; + module_ctx->module_kernel_threads_max = argon2_module_kernel_threads_max; + module_ctx->module_kernel_threads_min = argon2_module_kernel_threads_min; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type; @@ -447,7 +289,8 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_separator = MODULE_DEFAULT; 
module_ctx->module_st_hash = module_st_hash; module_ctx->module_st_pass = module_st_pass; - module_ctx->module_tmp_size = module_tmp_size; + module_ctx->module_tmp_size = argon2_module_tmp_size; module_ctx->module_unstable_warning = MODULE_DEFAULT; module_ctx->module_warmup_disable = MODULE_DEFAULT; } + From 4d39f881fd2f501b2b0251fc5259bae123be7e99 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Thu, 3 Jul 2025 10:26:51 +0200 Subject: [PATCH 26/37] support 2D/3D kernel invocation with Metal --- include/ext_metal.h | 2 +- src/backend.c | 23 +++++++++-------------- src/ext_metal.m | 6 +++--- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/include/ext_metal.h b/include/ext_metal.h index a7a7d37a9..85facc62b 100644 --- a/include/ext_metal.h +++ b/include/ext_metal.h @@ -111,7 +111,7 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_devi int hc_mtlEncodeComputeCommand_pre (void *hashcat_ctx, mtl_pipeline metal_pipeline, mtl_command_queue metal_command_queue, mtl_command_buffer *metal_command_buffer, mtl_command_encoder *metal_command_encoder); int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, size_t off, size_t idx, mtl_mem buf, void *host_data, size_t host_data_size); -int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, size_t global_work_size, size_t local_work_size, double *ms); +int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const size_t global_work_size[3], const size_t local_work_size[3], double *ms); #endif // __APPLE__ diff --git a/src/backend.c b/src/backend.c index 74b6b96e9..4fa01abb9 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2206,7 +2206,7 @@ int run_metal_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; return 0; } @@ -2234,7 +2234,7 @@ int run_metal_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_ double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; return 0; } @@ -2265,7 +2265,7 @@ int run_metal_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; } if (num16m) @@ -2913,31 +2913,26 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con size_t global_work_size[3] = { num_elements, 1, 1 }; size_t local_work_size[3] = { kernel_threads, 1, 1 }; - cl_uint work_dim = 1; - if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1)) { global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; - 
work_dim = 2; } if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2)) { global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; - work_dim = 2; } if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3)) { global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; - work_dim = 2; } double ms = 0; if (is_autotune == true) { - hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms); + hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms); // hc_mtlEncodeComputeCommand_pre() must be called before every hc_mtlEncodeComputeCommand() if (hc_mtlEncodeComputeCommand_pre (hashcat_ctx, metal_pipeline, device_param->metal_command_queue, &metal_command_buffer, &metal_command_encoder) == -1) return -1; @@ -2956,7 +2951,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con } } - const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms); + const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms); if (rc_cc != -1) { @@ -3349,7 +3344,7 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; } #endif // __APPLE__ @@ -3440,7 +3435,7 @@ int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; } #endif // __APPLE__ @@ -3524,7 +3519,7 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, double ms = 0; - const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms); + const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms); // release tmp_buf @@ -3604,7 +3599,7 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; } #endif // __APPLE__ diff --git a/src/ext_metal.m b/src/ext_metal.m index 6a51d48bd..cd23fb062 100644 --- a/src/ext_metal.m +++ b/src/ext_metal.m @@ -1314,10 +1314,10 @@ int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_com return 0; } -int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, size_t 
global_work_size, size_t local_work_size, double *ms) +int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const size_t global_work_size[3], const size_t local_work_size[3], double *ms) { - MTLSize numThreadgroups = {local_work_size, 1, 1}; - MTLSize threadsGroup = {global_work_size, 1, 1}; + MTLSize numThreadgroups = {local_work_size[0], local_work_size[1], local_work_size[2]}; + MTLSize threadsGroup = {global_work_size[0], global_work_size[1], global_work_size[2]}; if (metal_command_encoder == nil) { From bcc351068ffc0949a88a9c6c38b09b60f0c0897c Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Thu, 3 Jul 2025 22:06:32 +0200 Subject: [PATCH 27/37] Metal Backend: - added support to 2D/3D Compute - improved compute workloads calculation Makefile: - updated MACOSX_DEPLOYMENT_TARGET to 15.0 Unit tests: - updated install_modules.sh with Crypt::Argon2 Argon2 start works with Apple Metal --- OpenCL/inc_amp.h | 2 +- OpenCL/inc_common.h | 8 +++--- OpenCL/inc_markov.h | 6 ++--- OpenCL/inc_platform.h | 23 ++++++++++++++--- OpenCL/inc_shared.h | 10 ++++---- docs/changes.txt | 4 +++ include/ext_metal.h | 2 +- src/Makefile | 2 +- src/backend.c | 23 ++++++++++------- src/ext_metal.m | 53 ++++++++++++++++++++++++++++++---------- tools/install_modules.sh | 1 + 11 files changed, 93 insertions(+), 41 deletions(-) diff --git a/OpenCL/inc_amp.h b/OpenCL/inc_amp.h index 5db6a21d8..e49b23959 100644 --- a/OpenCL/inc_amp.h +++ b/OpenCL/inc_amp.h @@ -16,7 +16,7 @@ GLOBAL_AS const bf_t *bfs_buf, \ CONSTANT_AS const u32 &combs_mode, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #else // CUDA, HIP, OpenCL diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h index c24ecb524..d0b2ed989 100644 --- a/OpenCL/inc_common.h +++ b/OpenCL/inc_common.h @@ -124,10 +124,10 @@ #if defined IS_METAL #define KERN_ATTR_MAIN_PARAMS \ - uint hc_gid [[ thread_position_in_grid ]], \ - uint hc_lid [[ thread_position_in_threadgroup ]], \ - uint hc_lsz [[ threads_per_threadgroup ]], \ - uint hc_bid [[ threadgroup_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]], \ + uint3 hc_lid [[ thread_position_in_threadgroup ]], \ + uint3 hc_lsz [[ threads_per_threadgroup ]], \ + uint3 hc_bid [[ threadgroup_position_in_grid ]] #endif // IS_METAL /* diff --git a/OpenCL/inc_markov.h b/OpenCL/inc_markov.h index 3aae8f7fc..1ec187b8e 100644 --- a/OpenCL/inc_markov.h +++ b/OpenCL/inc_markov.h @@ -19,7 +19,7 @@ CONSTANT_AS const u32 &bits14, \ CONSTANT_AS const u32 &bits15, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #define KERN_ATTR_R_MARKOV \ GLOBAL_AS bf_t *pws_buf_r, \ @@ -31,7 +31,7 @@ CONSTANT_AS const u32 &bits14, \ CONSTANT_AS const u32 &bits15, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #define KERN_ATTR_C_MARKOV \ GLOBAL_AS pw_t *pws_buf, \ @@ -43,7 +43,7 @@ CONSTANT_AS const u32 &bits14, \ CONSTANT_AS const u32 &bits15, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #else // CUDA, HIP, OpenCL diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h index e1ffdefcf..9729d4fad 100644 --- a/OpenCL/inc_platform.h +++ b/OpenCL/inc_platform.h @@ -73,10 +73,25 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p); DECLSPEC u32 
hc_atomic_inc (volatile GLOBAL_AS u32 *p); DECLSPEC u32 hc_atomic_or (volatile GLOBAL_AS u32 *p, volatile const u32 val); -#define get_global_id(param) hc_gid -#define get_local_id(param) hc_lid -#define get_local_size(param) hc_lsz -#define get_group_id(param) hc_bid +#define get_global_id(dimindx) \ + ((dimindx) == 0 ? hc_gid.x : \ + (dimindx) == 1 ? hc_gid.y : \ + (dimindx) == 2 ? hc_gid.z : -1) + +#define get_group_id(dimindx) \ + ((dimindx) == 0 ? hc_bid.x : \ + (dimindx) == 1 ? hc_bid.y : \ + (dimindx) == 2 ? hc_bid.z : -1) + +#define get_local_id(dimindx) \ + ((dimindx) == 0 ? hc_lid.x : \ + (dimindx) == 1 ? hc_lid.y : \ + (dimindx) == 2 ? hc_lid.z : -1) + +#define get_local_size(dimindx) \ + ((dimindx) == 0 ? hc_lsz.x : \ + (dimindx) == 1 ? hc_lsz.y : \ + (dimindx) == 2 ? hc_lsz.z : -1) DECLSPEC u32x rotl32 (const u32x a, const int n); DECLSPEC u32x rotr32 (const u32x a, const int n); diff --git a/OpenCL/inc_shared.h b/OpenCL/inc_shared.h index 16f2e2c4c..6518c30c6 100644 --- a/OpenCL/inc_shared.h +++ b/OpenCL/inc_shared.h @@ -13,28 +13,28 @@ GLOBAL_AS u32 *pws_comp, \ GLOBAL_AS pw_t *pws_buf, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #define KERN_ATTR_GPU_MEMSET \ GLOBAL_AS uint4 *buf, \ CONSTANT_AS const u32 &value, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #define KERN_ATTR_GPU_BZERO \ GLOBAL_AS uint4 *buf, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #define KERN_ATTR_GPU_ATINIT \ GLOBAL_AS pw_t *buf, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #define KERN_ATTR_GPU_UTF8_TO_UTF16 \ GLOBAL_AS pw_t *pws_buf, \ CONSTANT_AS const u64 &gid_max, \ - uint hc_gid [[ thread_position_in_grid ]] + uint3 hc_gid [[ thread_position_in_grid ]] #else // CUDA, HIP, OpenCL diff --git a/docs/changes.txt b/docs/changes.txt index 06e1e46bb..cfdba32b4 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -148,19 +148,23 @@ - Status Code: Add specific return code for self-test fail (-11) - Scrypt: Increase buffer sizes in module for hash mode 8900 to allow longer scrypt digests - Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629 +- Unit tests: Updated install_modules.sh with Crypt::Argon2 - User Options: Added error message when mixing --username and --show to warn users of exponential delay - MetaMask: update extraction tool to support MetaMask Mobile wallets - SecureCRT MasterPassphrase v2: update module, pure kernels and test unit. Add optimized kernels. 
- Metal Backend: added workaround to prevent 'Infinite Loop' bug when build kernels - Metal Backend: added workaround to set the true Processor value in Metal devices on Apple Intel +- Metal Backend: added support to 2D/3D Compute - Metal Backend: allow use of devices with Metal if runtime version is >= 200 - Metal Backend: disable Metal devices only if at least one OpenCL device is active +- Metal Backend: improved compute workloads calculation - Modules: Check UnpackSize to raise false positive with hc_decompress_rar - User Options: added --metal-compiler-runtime option - Hardware Monitor: avoid sprintf in src/ext_iokit.c - Hardware Monitor: Splitting hwmon_ctx_init function into smaller library-specific functions - Help: show supported hash-modes only with -hh - Makefile: prevent make failure with Apple Silicon in case of partial rebuild +- Makefile: updated MACOSX_DEPLOYMENT_TARGET to 15.0 - Rules: Rename best64.rule to best66.rule and remove the unknown section from it * changes v6.2.5 -> v6.2.6 diff --git a/include/ext_metal.h b/include/ext_metal.h index 85facc62b..b51f09ff2 100644 --- a/include/ext_metal.h +++ b/include/ext_metal.h @@ -111,7 +111,7 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_devi int hc_mtlEncodeComputeCommand_pre (void *hashcat_ctx, mtl_pipeline metal_pipeline, mtl_command_queue metal_command_queue, mtl_command_buffer *metal_command_buffer, mtl_command_encoder *metal_command_encoder); int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, size_t off, size_t idx, mtl_mem buf, void *host_data, size_t host_data_size); -int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const size_t global_work_size[3], const size_t local_work_size[3], double *ms); +int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms); #endif // __APPLE__ diff --git a/src/Makefile b/src/Makefile index 2ed479330..2cfbf9406 100644 --- a/src/Makefile +++ b/src/Makefile @@ -358,7 +358,7 @@ LFLAGS_NATIVE += -lpthread endif # NetBSD ifeq ($(UNAME),Darwin) -export MACOSX_DEPLOYMENT_TARGET=10.15 +export MACOSX_DEPLOYMENT_TARGET=15.0 CFLAGS_NATIVE := $(CFLAGS) CFLAGS_NATIVE += -DWITH_HWMON diff --git a/src/backend.c b/src/backend.c index 4fa01abb9..3109f8918 100644 --- a/src/backend.c +++ b/src/backend.c @@ -2206,7 +2206,7 @@ int run_metal_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1; return 0; } @@ -2234,7 +2234,7 @@ int run_metal_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_ double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1; return 0; } @@ -2265,7 +2265,7 @@ int run_metal_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic double ms = 0; - if 
(hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1; } if (num16m) @@ -2910,29 +2910,34 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con num_elements = num_elements * kernel_threads; } + unsigned int work_dim = 1; + size_t global_work_size[3] = { num_elements, 1, 1 }; size_t local_work_size[3] = { kernel_threads, 1, 1 }; if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1)) { global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; } if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2)) { global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; } if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3)) { global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy; + work_dim = 2; } double ms = 0; if (is_autotune == true) { - hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms); + hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, work_dim, global_work_size, local_work_size, &ms); // hc_mtlEncodeComputeCommand_pre() must be called before every hc_mtlEncodeComputeCommand() if (hc_mtlEncodeComputeCommand_pre (hashcat_ctx, metal_pipeline, device_param->metal_command_queue, &metal_command_buffer, &metal_command_encoder) == -1) return -1; @@ -2951,7 +2956,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con } } - const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms); + const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, work_dim, global_work_size, local_work_size, &ms); if (rc_cc != -1) { @@ -3344,7 +3349,7 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1; } #endif // __APPLE__ @@ -3435,7 +3440,7 @@ int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1; } #endif // __APPLE__ @@ -3519,7 +3524,7 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, double ms = 0; - const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms); + const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms); // release tmp_buf @@ -3599,7 +3604,7 @@ int run_kernel_decompress 
(hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device double ms = 0; - if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size, local_work_size, &ms) == -1) return -1; + if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1; } #endif // __APPLE__ diff --git a/src/ext_metal.m b/src/ext_metal.m index cd23fb062..a05c1a0e7 100644 --- a/src/ext_metal.m +++ b/src/ext_metal.m @@ -195,11 +195,14 @@ static int hc_mtlBuildOptionsToDict (void *hashcat_ctx, const char *build_option } // if set, add INCLUDE_PATH to hack Apple kernel build from source limitation on -I usage + if (include_path != nil) { NSString *path_key = @"INCLUDE_PATH"; NSString *path_value = [NSString stringWithCString: include_path encoding: NSUTF8StringEncoding]; + // Include path may contain spaces, escape them with a backslash + path_value = [path_value stringByReplacingOccurrencesOfString:@" " withString:@"\\ "]; [build_options_dict setObject:path_value forKey:path_key]; @@ -743,6 +746,7 @@ int hc_mtlCreateKernel (void *hashcat_ctx, mtl_device_id metal_device, mtl_libra dispatch_queue_t queue = dispatch_get_global_queue (DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); // if no user-defined runtime, set to METAL_COMPILER_RUNTIME + long timeout = (user_options->metal_compiler_runtime > 0) ? user_options->metal_compiler_runtime : METAL_COMPILER_RUNTIME; dispatch_time_t when = dispatch_time (DISPATCH_TIME_NOW,NSEC_PER_SEC * timeout); @@ -1314,10 +1318,21 @@ int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_com return 0; } -int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const size_t global_work_size[3], const size_t local_work_size[3], double *ms) +int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms) { - MTLSize numThreadgroups = {local_work_size[0], local_work_size[1], local_work_size[2]}; - MTLSize threadsGroup = {global_work_size[0], global_work_size[1], global_work_size[2]}; + MTLSize threadsPerThreadgroup = + { + local_work_size[0], + local_work_size[1], + local_work_size[2] + }; + + MTLSize threadgroupsPerGrid = + { + (global_work_size[0] + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width, + work_dim > 1 ? (global_work_size[1] + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height : 1, + work_dim > 2 ? 
(global_work_size[2] + threadsPerThreadgroup.depth - 1) / threadsPerThreadgroup.depth : 1 + }; if (metal_command_encoder == nil) { @@ -1333,7 +1348,7 @@ int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_com return -1; } - [metal_command_encoder dispatchThreadgroups: threadsGroup threadsPerThreadgroup: numThreadgroups]; + [metal_command_encoder dispatchThreadgroups: threadgroupsPerGrid threadsPerThreadgroup: threadsPerThreadgroup]; [metal_command_encoder endEncoding]; [metal_command_buffer commit]; @@ -1377,17 +1392,22 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_device, if (k_string != nil) { - id r = [metal_device newLibraryWithFile: k_string error: &error]; + NSURL *libURL = [NSURL fileURLWithPath: k_string]; - if (error != nil) + if (libURL != nil) { - event_log_error (hashcat_ctx, "%s(): failed to create metal library from metallib, %s", __func__, [[error localizedDescription] UTF8String]); - return -1; + id r = [metal_device newLibraryWithURL: libURL error:&error]; + + if (error != nil) + { + event_log_error (hashcat_ctx, "%s(): failed to create metal library from metallib, %s", __func__, [[error localizedDescription] UTF8String]); + return -1; + } + + *metal_library = r; + + return 0; } - - *metal_library = r; - - return 0; } return -1; @@ -1420,10 +1440,17 @@ int hc_mtlCreateLibraryWithSource (void *hashcat_ctx, mtl_device_id metal_device } compileOptions.preprocessorMacros = build_options_dict; + /* + compileOptions.optimizationLevel = MTLLibraryOptimizationLevelSize; + compileOptions.mathMode = MTLMathModeSafe; + // compileOptions.mathMode = MTLMathModeRelaxed; + // compileOptions.enableLogging = true; + // compileOptions.fastMathEnabled = false; + */ } // todo: detect current os version and choose the right -// compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3; + // compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3; /* if (@available(macOS 12.0, *)) { diff --git a/tools/install_modules.sh b/tools/install_modules.sh index 683d20000..948856e34 100755 --- a/tools/install_modules.sh +++ b/tools/install_modules.sh @@ -18,6 +18,7 @@ cpan install Authen::Passphrase::LANManager \ Bitcoin::Crypto::Base58 \ Compress::Zlib \ Convert::EBCDIC \ + Crypt::Argon2 \ Crypt::AuthEnc::GCM \ Crypt::Camellia \ Crypt::CBC \ From 80803e2ea5ccdf7b73350ac6a5c62816db864386 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Fri, 4 Jul 2025 08:04:44 +0200 Subject: [PATCH 28/37] fix -a9 by add missing get_global_id() in m34000_loop --- OpenCL/m34000-pure.cl | 1 + 1 file changed, 1 insertion(+) diff --git a/OpenCL/m34000-pure.cl b/OpenCL/m34000-pure.cl index 6fba3590c..ba87d835c 100644 --- a/OpenCL/m34000-pure.cl +++ b/OpenCL/m34000-pure.cl @@ -58,6 +58,7 @@ KERNEL_FQ KERNEL_FA void m34000_init (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2 KERNEL_FQ KERNEL_FA void m34000_loop (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t)) { + const u64 gid = get_global_id (0); const u64 bid = get_group_id (0); const u64 lid = get_local_id (1); const u64 lsz = get_local_size (1); From 25b9e67470b8963ea76193467dfb306858e13be3 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Fri, 4 Jul 2025 08:09:56 +0200 Subject: [PATCH 29/37] make error messages on hashes_init_stage5() generic --- src/hashes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashes.c b/src/hashes.c index 72ab14433..40d94941f 100644 --- a/src/hashes.c +++ b/src/hashes.c @@ -2386,13 +2386,13 @@ int hashes_init_stage5 (hashcat_ctx_t *hashcat_ctx) 
char *st_hash = strdup (tmp_buf); - event_log_error (hashcat_ctx, "ERROR: Incompatible self-test SCRYPT configuration detected."); + event_log_error (hashcat_ctx, "ERROR: Incompatible self-test configuration detected."); event_log_warning (hashcat_ctx, "The specified target hash:"); event_log_warning (hashcat_ctx, " -> %s", user_hash); - event_log_warning (hashcat_ctx, "does not match the SCRYPT configuration of the self-test hash:"); + event_log_warning (hashcat_ctx, "does not match the configuration of the self-test hash:"); event_log_warning (hashcat_ctx, " -> %s", st_hash); - event_log_warning (hashcat_ctx, "The JIT-compiled kernel for this SCRYPT configuration may be incompatible."); + event_log_warning (hashcat_ctx, "The JIT-compiled kernel for this configuration may be incompatible."); event_log_warning (hashcat_ctx, "You must disable the self-test functionality or recompile the plugin with a matching self-test hash."); event_log_warning (hashcat_ctx, "To disable the self-test, use the --self-test-disable option."); event_log_warning (hashcat_ctx, NULL); @@ -2414,11 +2414,11 @@ int hashes_init_stage5 (hashcat_ctx_t *hashcat_ctx) char *user_hash2 = strdup (tmp_buf); - event_log_error (hashcat_ctx, "ERROR: Mixed SCRYPT configuration detected."); + event_log_error (hashcat_ctx, "ERROR: Mixed configuration detected."); event_log_warning (hashcat_ctx, "The specified target hash:"); event_log_warning (hashcat_ctx, " -> %s", user_hash); - event_log_warning (hashcat_ctx, "does not match the SCRYPT configuration of another target hash:"); + event_log_warning (hashcat_ctx, "does not match the configuration of another target hash:"); event_log_warning (hashcat_ctx, " -> %s", user_hash2); event_log_warning (hashcat_ctx, "Please run these hashes in separate cracking sessions."); event_log_warning (hashcat_ctx, NULL); From d3983edaf22f5370d1bb136c930460bcad2d9f6e Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Fri, 4 Jul 2025 21:51:32 +0200 Subject: [PATCH 30/37] Improved handling in get_opencl_kernel_wgs() There are cases where we fix the thread count in a kernel using FIXED_LOCAL_SIZE, but when the runtime loads the kernel binary, it reports that it can only execute it with a different thread count. According to the OpenCL specification, this can happen due to register pressure. However, we fix the thread count for a specific reason, and we choose to accept potential register spilling to global memory. A warning is now issued to inform the user about the runtime's suggested thread count, allowing them to override it via the command line if they encounter issues. Also fixed the thread count for -m 10700 on NVIDIA's OpenCL, where 4 bytes are always lost for an unknown reason (similar to the issue seen in bcrypt). --- src/backend.c | 9 ++++++- src/modules/module_10700.c | 53 ++++++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/src/backend.c b/src/backend.c index 3109f8918..e324a2e01 100644 --- a/src/backend.c +++ b/src/backend.c @@ -9336,7 +9336,14 @@ static int get_opencl_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t if (cwgs_total > 0) { - kernel_threads = MIN (kernel_threads, (u32) cwgs_total); + if (kernel_threads < cwgs_total) + { + // Very likely some bug, because the runtime was unable to follow our requirement to run N threads guaranteed on this kernel + + event_log_warning (hashcat_ctx, "* Device #%u: Runtime returned CL_KERNEL_WORK_GROUP_SIZE=%d, but CL_KERNEL_COMPILE_WORK_GROUP_SIZE=%d. 
Use -T%d if you run into problems.", device_param->device_id + 1, (int) kernel_threads, (int) cwgs_total, (int) kernel_threads); + } + + kernel_threads = cwgs_total; } *result = kernel_threads; diff --git a/src/modules/module_10700.c b/src/modules/module_10700.c index 4a7725e52..b0bdd4a41 100644 --- a/src/modules/module_10700.c +++ b/src/modules/module_10700.c @@ -152,38 +152,47 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) { + const u32 shared_size_scratch = (32 + 64 + 16); // LOCAL_VK u32 s_sc[FIXED_LOCAL_SIZE][PWMAXSZ4 + BLMAXSZ4 + AESSZ4]; + const u32 shared_size_aes = (5 * 1024); // LOCAL_VK u32 s_te0[256]; + char *jit_build_options = NULL; - if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) { - u32 native_threads = 0; + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", 1); + } + else + { + u32 overhead = 0; - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + if (device_param->opencl_device_vendor_id == VENDOR_ID_NV) { - native_threads = 1; - } - else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) - { - #if defined (__APPLE__) + // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with: + // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max) + // on my development system. no clue where the 4 bytes are spent. + // I did some research on this and it seems to be related with the datatype. + // For example, if i used u8 instead, there's only 1 byte wasted. - native_threads = 32; - - #else - - if (device_param->device_local_mem_size < 49152) + if (device_param->is_opencl == true) { - native_threads = MIN (device_param->kernel_preferred_wgs_multiple, 32); // We can't just set 32, because Intel GPU need 8 + overhead = 1; } - else - { - // to go over 48KiB, we need to use dynamic shared mem - native_threads = 49152 / 128; - } - - #endif } - hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", native_threads); + const u32 device_local_mem_size = MIN (device_param->device_local_mem_size, 48*1024); + + u32 fixed_local_size = ((device_local_mem_size - overhead) - shared_size_aes) / shared_size_scratch; + + if (user_options->kernel_threads_chgd == true) + { + fixed_local_size = user_options->kernel_threads; + } + else + { + if (fixed_local_size > device_param->kernel_preferred_wgs_multiple) fixed_local_size -= fixed_local_size % device_param->kernel_preferred_wgs_multiple; + } + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", fixed_local_size); } return jit_build_options; From d2656e376d597515fa89e49b32c2d3a2121eb273 Mon Sep 17 00:00:00 2001 From: red Date: Sat, 5 Jul 2025 00:30:39 +0200 Subject: [PATCH 31/37] Update hashcat-python-plugin-development-guide.md clarify location of custom Python scripts to be run without creating a dedicated module. 
--- docs/hashcat-python-plugin-development-guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/hashcat-python-plugin-development-guide.md b/docs/hashcat-python-plugin-development-guide.md index 308b6d918..b93f2d66c 100644 --- a/docs/hashcat-python-plugin-development-guide.md +++ b/docs/hashcat-python-plugin-development-guide.md @@ -210,13 +210,13 @@ Notes: If you modify one of these plugin files, there's a trade-off: you won’t be able to contribute that code directly to the upstream Hashcat repository, since those files are meant to remain clean for demonstration purposes. -To address this, the assimilation bridge provides a generic parameter that users can specify via the command line. In the case of the Python bridge, only the first parameter is used. You can override the Python script to be loaded using `--bridge-parameter1`: +To address this, the assimilation bridge provides a generic parameter that users can specify via the command line. In the case of the Python bridge, only the first parameter is used. Using `--bridge-parameter1` allows you to override the Python script to be loaded: ``` -$ ./hashcat -m 73000 --bridge-parameter1 myimplementation.py hash.txt wordlist.txt ... +$ ./hashcat -m 73000 --bridge-parameter1 ./Python/myimplementation.py hash.txt wordlist.txt ... ``` -This tells the Python bridge plugin to load `myimplementation.py` instead of the default `generic_hash_mp.py`. This approach is especially useful if you plan to contribute `myimplementation.py` to the upstream Hashcat repository. If you choose to stay within the generic mode, your Python code won’t have a dedicated hash mode, and you'll need to instruct users to use the `--bridge-parameter1` flag to load your implementation. +This tells the Python bridge plugin to load `myimplementation.py` located in the local `Python` subdirectory instead of the default `generic_hash_mp.py`. This approach is especially useful if you plan to contribute `myimplementation.py` to the upstream Hashcat repository. If you choose to stay within the generic mode, your Python code won’t have a dedicated hash mode, and you'll need to instruct users to use the `--bridge-parameter1` flag to load your implementation. ### Design Tradeoffs and Format Considerations From 381b2cac67d5d54d139baab318e7bc6a326b332d Mon Sep 17 00:00:00 2001 From: red Date: Sat, 5 Jul 2025 00:36:01 +0200 Subject: [PATCH 32/37] Update BUILD_WSL.md Bump version of mingw-w64-x86_64-python-3.12.XX-X-any.pkg.tar.zst to latest. --- BUILD_WSL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD_WSL.md b/BUILD_WSL.md index cd5b6fd5b..957208381 100644 --- a/BUILD_WSL.md +++ b/BUILD_WSL.md @@ -19,7 +19,7 @@ cd win-iconv/ cmake -D WIN_ICONV_BUILD_EXECUTABLE=OFF -D CMAKE_INSTALL_PREFIX=/opt/win-iconv-64 -D CMAKE_CXX_COMPILER=$(which x86_64-w64-mingw32-g++) -D CMAKE_C_COMPILER=$(which x86_64-w64-mingw32-gcc) -D CMAKE_SYSTEM_NAME=Windows sudo make install cd ../ -wget https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-python-3.12.10-1-any.pkg.tar.zst +wget https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-python-3.12.11-1-any.pkg.tar.zst sudo mkdir /opt/win-python sudo tar --zstd -xf mingw-w64-x86_64-python-3.12.10-1-any.pkg.tar.zst -C /opt/win-python ``` From 9457c62ef0d58114d7a7097f322063e9eabd0d8e Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 5 Jul 2025 19:44:31 +0200 Subject: [PATCH 33/37] Removed redundant casts in inc_hash_blake2b.cl and inc_hash_blake2s.cl. 
Fixed parameter types in inc_hash_blake2b.cl and inc_hash_blake2s.cl for FINAL value. Added kernel code for -m 15400 to s04/s08/m04/m08, even if not needed, to help autotune find optimal workitem settings. Fixed a rare autotune case (e.g. in mode 18600) where threads_min was not a multiple of kernel_preferred_wgs_multiple, and changes it so that as long as it only threads_min is affected and not threads_max, we now ensure at least kernel_preferred_wgs_multiple. Improved autotune logic for best thread count: double thread count until reaching the device's preferred multiple, then increase in steps of that multiple while comparing efficiency vs. runtime, and select the configuration with best efficiency, not highest thread count. Always set funnelshift support to true for HIP devices, as it always reports false. Set minimum loop count to 250 for all VeraCrypt modes with PIM brute-force support. --- OpenCL/inc_hash_blake2b.cl | 10 +- OpenCL/inc_hash_blake2s.cl | 14 +- OpenCL/m15400_a3-optimized.cl | 333 +++++++++++++++++++++++++++++++++- OpenCL/m18600-pure.cl | 2 +- src/autotune.c | 41 ++++- src/backend.c | 2 +- src/modules/module_13711.c | 9 +- src/modules/module_13712.c | 9 +- src/modules/module_13713.c | 9 +- src/modules/module_13721.c | 9 +- src/modules/module_13722.c | 9 +- src/modules/module_13723.c | 9 +- src/modules/module_13731.c | 9 +- src/modules/module_13732.c | 9 +- src/modules/module_13733.c | 9 +- src/modules/module_13751.c | 9 +- src/modules/module_13752.c | 9 +- src/modules/module_13753.c | 9 +- src/modules/module_13771.c | 9 +- src/modules/module_13772.c | 9 +- src/modules/module_13773.c | 9 +- src/modules/module_29411.c | 9 +- src/modules/module_29412.c | 9 +- src/modules/module_29413.c | 9 +- src/modules/module_29421.c | 9 +- src/modules/module_29422.c | 9 +- src/modules/module_29423.c | 9 +- src/modules/module_29431.c | 9 +- src/modules/module_29432.c | 9 +- src/modules/module_29433.c | 9 +- src/modules/module_29451.c | 9 +- src/modules/module_29452.c | 9 +- src/modules/module_29453.c | 9 +- src/modules/module_29471.c | 9 +- src/modules/module_29472.c | 9 +- src/modules/module_29473.c | 9 +- 36 files changed, 615 insertions(+), 57 deletions(-) diff --git a/OpenCL/inc_hash_blake2b.cl b/OpenCL/inc_hash_blake2b.cl index 03a44b3ff..b205b18a7 100644 --- a/OpenCL/inc_hash_blake2b.cl +++ b/OpenCL/inc_hash_blake2b.cl @@ -409,7 +409,7 @@ DECLSPEC void blake2b_update (PRIVATE_AS blake2b_ctx_t *ctx, PRIVATE_AS const u3 u32 w6[4]; u32 w7[4]; - const int limit = (const int) len - 128; // int type needed, could be negative + const int limit = len - 128; // int type needed, could be negative int pos1; int pos4; @@ -499,7 +499,7 @@ DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS co u32 w6[4]; u32 w7[4]; - const int limit = (const int) len - 128; // int type needed, could be negative + const int limit = len - 128; // int type needed, could be negative int pos1; int pos4; @@ -580,7 +580,7 @@ DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS co DECLSPEC void blake2b_final (PRIVATE_AS blake2b_ctx_t *ctx) { - blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_FINAL); + blake2b_transform (ctx->h, ctx->m, ctx->len, (u64) BLAKE2B_FINAL); } DECLSPEC void blake2b_transform_vector (PRIVATE_AS u64x *h, PRIVATE_AS const u64x *m, const u32x len, const u64 f0) @@ -813,7 +813,7 @@ DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVA u32x w6[4]; u32x w7[4]; - const int limit = (const int) len - 128; // int type 
needed, could be negative + const int limit = len - 128; // int type needed, could be negative int pos1; int pos4; @@ -894,5 +894,5 @@ DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVA DECLSPEC void blake2b_final_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx) { - blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_FINAL); + blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, (u64) BLAKE2B_FINAL); } diff --git a/OpenCL/inc_hash_blake2s.cl b/OpenCL/inc_hash_blake2s.cl index 77ecc02fb..99d4389fb 100644 --- a/OpenCL/inc_hash_blake2s.cl +++ b/OpenCL/inc_hash_blake2s.cl @@ -322,7 +322,7 @@ DECLSPEC void blake2s_update (PRIVATE_AS blake2s_ctx_t *ctx, PRIVATE_AS const u3 u32 w2[4]; u32 w3[4]; - const int limit = (const int) len - 64; // int type needed, could be negative + const int limit = len - 64; // int type needed, could be negative int pos1; int pos4; @@ -376,7 +376,7 @@ DECLSPEC void blake2s_update_global (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_AS co u32 w2[4]; u32 w3[4]; - const int limit = (const int) len - 64; // int type needed, could be negative + const int limit = len - 64; // int type needed, could be negative int pos1; int pos4; @@ -516,7 +516,7 @@ DECLSPEC void blake2s_update_global_swap (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_ u32 w2[4]; u32 w3[4]; - const int limit = (const int) len - 64; // int type needed, could be negative + const int limit = len - 64; // int type needed, could be negative int pos1; int pos4; @@ -597,13 +597,11 @@ DECLSPEC void blake2s_update_global_swap (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_ blake2s_update_64 (ctx, w0, w1, w2, w3, len - (u32) pos1); } - DECLSPEC void blake2s_final (PRIVATE_AS blake2s_ctx_t *ctx) { - blake2s_transform (ctx->h, ctx->m, ctx->len, BLAKE2S_FINAL); + blake2s_transform (ctx->h, ctx->m, ctx->len, (u32) BLAKE2S_FINAL); } - DECLSPEC void blake2s_hmac_init_64 (PRIVATE_AS blake2s_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, PRIVATE_AS const u32 *w2, PRIVATE_AS const u32 *w3) { u32 a0[4]; @@ -1158,7 +1156,7 @@ DECLSPEC void blake2s_update_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx, PRIVA u32x w2[4]; u32x w3[4]; - const int limit = (const int) len - 64; // int type needed, could be negative + const int limit = len - 64; // int type needed, could be negative int pos1; int pos4; @@ -1207,7 +1205,7 @@ DECLSPEC void blake2s_update_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx, PRIVA DECLSPEC void blake2s_final_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx) { - blake2s_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2S_FINAL); + blake2s_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, (u32) BLAKE2S_FINAL); } DECLSPEC void blake2s_hmac_init_vector_64 (PRIVATE_AS blake2s_hmac_ctx_vector_t *ctx, PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3) diff --git a/OpenCL/m15400_a3-optimized.cl b/OpenCL/m15400_a3-optimized.cl index a0f856bac..f60635a1a 100644 --- a/OpenCL/m15400_a3-optimized.cl +++ b/OpenCL/m15400_a3-optimized.cl @@ -238,12 +238,162 @@ DECLSPEC void chacha20_transform (PRIVATE_AS const u32x *w0, PRIVATE_AS const u3 KERNEL_FQ KERNEL_FA void m15400_m04 (KERN_ATTR_VECTOR_ESALT (chacha20_t)) { - // fixed size 32 + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + if (gid >= GID_CNT) return; + + u32 w0[4]; + u32 w1[4]; + + w0[0] = pws[gid].i[0]; + w0[1] = pws[gid].i[1]; + w0[2] = pws[gid].i[2]; + w0[3] = pws[gid].i[3]; + w1[0] = pws[gid].i[4]; + w1[1] = 
pws[gid].i[5]; + w1[2] = pws[gid].i[6]; + w1[3] = pws[gid].i[7]; + + /** + * Salt prep + */ + + u32 iv[2]; + + iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0]; + iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1]; + + u32 plain[2]; + + plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0]; + plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1]; + + u32 position[2]; + + position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0]; + position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1]; + + u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset; + + /** + * loop + */ + + u32 w0l = pws[gid].i[0]; + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0x = w0l | w0r; + + u32x w0_t[4]; + u32x w1_t[4]; + + w0_t[0] = w0x; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + u32x digest[4] = { 0 }; + + chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest); + + const u32x r0 = digest[0]; + const u32x r1 = digest[1]; + const u32x r2 = digest[2]; + const u32x r3 = digest[3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } } KERNEL_FQ KERNEL_FA void m15400_m08 (KERN_ATTR_VECTOR_ESALT (chacha20_t)) { - // fixed size 32 + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + if (gid >= GID_CNT) return; + + u32 w0[4]; + u32 w1[4]; + + w0[0] = pws[gid].i[0]; + w0[1] = pws[gid].i[1]; + w0[2] = pws[gid].i[2]; + w0[3] = pws[gid].i[3]; + w1[0] = pws[gid].i[4]; + w1[1] = pws[gid].i[5]; + w1[2] = pws[gid].i[6]; + w1[3] = pws[gid].i[7]; + + /** + * Salt prep + */ + + u32 iv[2]; + + iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0]; + iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1]; + + u32 plain[2]; + + plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0]; + plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1]; + + u32 position[2]; + + position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0]; + position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1]; + + u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset; + + /** + * loop + */ + + u32 w0l = pws[gid].i[0]; + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0x = w0l | w0r; + + u32x w0_t[4]; + u32x w1_t[4]; + + w0_t[0] = w0x; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + u32x digest[4] = { 0 }; + + chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest); + + const u32x r0 = digest[0]; + const u32x r1 = digest[1]; + const u32x r2 = digest[2]; + const u32x r3 = digest[3]; + + COMPARE_M_SIMD (r0, r1, r2, r3); + } } KERNEL_FQ KERNEL_FA void m15400_m16 (KERN_ATTR_VECTOR_ESALT (chacha20_t)) @@ -328,12 +478,187 @@ KERNEL_FQ KERNEL_FA void m15400_m16 (KERN_ATTR_VECTOR_ESALT (chacha20_t)) KERNEL_FQ KERNEL_FA void m15400_s04 (KERN_ATTR_VECTOR_ESALT (chacha20_t)) { - // fixed size 32 + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + if (gid >= GID_CNT) return; + + u32 w0[4]; + u32 w1[4]; + + w0[0] = pws[gid].i[0]; + w0[1] = pws[gid].i[1]; + w0[2] = pws[gid].i[2]; + w0[3] = pws[gid].i[3]; + w1[0] = pws[gid].i[4]; + w1[1] = pws[gid].i[5]; + w1[2] = pws[gid].i[6]; + w1[3] = pws[gid].i[7]; + + /** + * Salt prep + */ + + u32 iv[2]; + + iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0]; + iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1]; + + u32 plain[2]; 
+ + plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0]; + plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1]; + + u32 position[2]; + + position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0]; + position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1]; + + u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = pws[gid].i[0]; + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0x = w0l | w0r; + + u32x w0_t[4]; + u32x w1_t[4]; + + w0_t[0] = w0x; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + u32x digest[4] = { 0 }; + + chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest); + + const u32x r0 = digest[0]; + const u32x r1 = digest[1]; + const u32x r2 = digest[2]; + const u32x r3 = digest[3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } } + KERNEL_FQ KERNEL_FA void m15400_s08 (KERN_ATTR_VECTOR_ESALT (chacha20_t)) { - // fixed size 32 + /** + * modifier + */ + + const u64 gid = get_global_id (0); + const u64 lid = get_local_id (0); + + if (gid >= GID_CNT) return; + + u32 w0[4]; + u32 w1[4]; + + w0[0] = pws[gid].i[0]; + w0[1] = pws[gid].i[1]; + w0[2] = pws[gid].i[2]; + w0[3] = pws[gid].i[3]; + w1[0] = pws[gid].i[4]; + w1[1] = pws[gid].i[5]; + w1[2] = pws[gid].i[6]; + w1[3] = pws[gid].i[7]; + + /** + * Salt prep + */ + + u32 iv[2]; + + iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0]; + iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1]; + + u32 plain[2]; + + plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0]; + plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1]; + + u32 position[2]; + + position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0]; + position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1]; + + u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset; + + /** + * digest + */ + + const u32 search[4] = + { + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2], + digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3] + }; + + /** + * loop + */ + + u32 w0l = pws[gid].i[0]; + + for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE) + { + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; + const u32x w0x = w0l | w0r; + + u32x w0_t[4]; + u32x w1_t[4]; + + w0_t[0] = w0x; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + u32x digest[4] = { 0 }; + + chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest); + + const u32x r0 = digest[0]; + const u32x r1 = digest[1]; + const u32x r2 = digest[2]; + const u32x r3 = digest[3]; + + COMPARE_S_SIMD (r0, r1, r2, r3); + } } KERNEL_FQ KERNEL_FA void m15400_s16 (KERN_ATTR_VECTOR_ESALT (chacha20_t)) diff --git a/OpenCL/m18600-pure.cl b/OpenCL/m18600-pure.cl index 54bba75d7..144df2063 100644 --- a/OpenCL/m18600-pure.cl +++ b/OpenCL/m18600-pure.cl @@ -636,7 +636,7 @@ KERNEL_FQ KERNEL_FA void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t } } -KERNEL_FQ KERNEL_FA void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, 
odf11_t)) +KERNEL_FQ KERNEL_FA FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) void m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t)) { const u64 gid = get_global_id (0); const u64 lid = get_local_id (0); diff --git a/src/autotune.c b/src/autotune.c index a599e65be..4a4dc0d85 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -333,6 +333,21 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // v7 autotuner is a lot more straight forward + if (kernel_threads_min < kernel_threads_max) + { + // there could be a situation, like in 18600, where we have a thread_min which is not a multiple of + // kernel_preferred_wgs_multiple. As long as it's only a threads_min, but not a threads_max, we + // should stick to at least kernel_preferred_wgs_multiple + + if (kernel_threads_min % device_param->kernel_preferred_wgs_multiple) + { + if ((device_param->kernel_preferred_wgs_multiple >= kernel_threads_min) && (device_param->kernel_preferred_wgs_multiple <= kernel_threads_max)) + { + kernel_threads = device_param->kernel_preferred_wgs_multiple; + } + } + } + if (hashes && hashes->st_salts_buf) { u32 start = kernel_loops_max; @@ -356,15 +371,15 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param for (u32 kernel_loops_test = kernel_loops; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1) { - double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_test, kernel_threads_min, 2); + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops_test, kernel_threads, 2); - //printf ("loop %f %u %u %u\n", exec_msec, kernel_accel_min, kernel_loops_test, kernel_threads_min); + //printf ("loop %f %u %u %u\n", exec_msec, kernel_accel, kernel_loops_test, kernel_threads); if (exec_msec > target_msec) break; // we want a little room for threads to play with so not full target_msec // but of course only if we are going to make use of that :) - if ((kernel_accel_min < kernel_accel_max) || (kernel_threads_min < kernel_threads_max)) + if ((kernel_accel < kernel_accel_max) || (kernel_threads < kernel_threads_max)) { if (exec_msec > target_msec / 8) break; @@ -378,11 +393,14 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param kernel_loops = kernel_loops_test; } - for (u32 kernel_threads_test = kernel_threads_min; kernel_threads_test <= kernel_threads_max; kernel_threads_test <<= 1) - { - double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops, kernel_threads_test, 2); + double exec_msec_init = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2); + + float threads_eff_best = exec_msec_init / kernel_threads; + + for (u32 kernel_threads_test = kernel_threads; kernel_threads_test <= kernel_threads_max; kernel_threads_test = (kernel_threads_test < device_param->kernel_preferred_wgs_multiple) ? 
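/*
 * A minimal standalone sketch, not hashcat code, of the kernel_threads_min
 * adjustment in the autotune hunk above: when threads_min is not a multiple of
 * the device's preferred work-group multiple, and only the lower bound is
 * affected (the preferred multiple still fits below threads_max), autotune now
 * starts from the preferred multiple instead. The values in the usage comment
 * are hypothetical.
 */
#include <stdint.h>

typedef uint32_t u32;

static u32 initial_thread_count (const u32 kernel_threads_min, const u32 kernel_threads_max, const u32 kernel_preferred_wgs_multiple)
{
  u32 kernel_threads = kernel_threads_min;

  if (kernel_threads_min < kernel_threads_max)
  {
    if (kernel_threads_min % kernel_preferred_wgs_multiple)
    {
      if ((kernel_preferred_wgs_multiple >= kernel_threads_min) && (kernel_preferred_wgs_multiple <= kernel_threads_max))
      {
        kernel_threads = kernel_preferred_wgs_multiple;
      }
    }
  }

  return kernel_threads;
}

// usage: initial_thread_count (8, 1024, 32) == 32, initial_thread_count (8, 8, 32) == 8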
kernel_threads_test << 1 : kernel_threads_test + device_param->kernel_preferred_wgs_multiple) + { + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads_test, 2); - //printf ("threads %f %u %u %u\n", exec_msec, kernel_accel_min, kernel_loops, kernel_threads_test); if (exec_msec > target_msec) break; if (kernel_threads >= 32) @@ -392,7 +410,14 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (exec_msec > target_msec / 8) break; } - kernel_threads = kernel_threads_test; + float threads_eff_cur = exec_msec / kernel_threads_test; + + if ((threads_eff_cur * 1.05) < threads_eff_best) + { + threads_eff_best = threads_eff_cur; + + kernel_threads = kernel_threads_test; + } } #define STEPS_CNT 12 diff --git a/src/backend.c b/src/backend.c index e324a2e01..789dcd7df 100644 --- a/src/backend.c +++ b/src/backend.c @@ -6438,7 +6438,7 @@ static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virth device_param->has_lop3 = false; device_param->has_mov64 = false; device_param->has_prmt = false; - device_param->has_shfw = prop.arch.hasFunnelShift; + device_param->has_shfw = true; // always reports false : prop.arch.hasFunnelShift; // device_available_mem diff --git a/src/modules/module_13711.c b/src/modules/module_13711.c index ff18b6c52..9e72c4b0f 100644 --- a/src/modules/module_13711.c +++ b/src/modules/module_13711.c @@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13712.c b/src/modules/module_13712.c index b39eecfe7..26ad43579 100644 --- a/src/modules/module_13712.c +++ b/src/modules/module_13712.c @@ -146,6 +146,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -361,7 +368,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; 
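/*
 * A minimal standalone sketch, not hashcat code, of the efficiency-based
 * thread selection introduced in the autotune hunk earlier in this patch:
 * exec_msec divided by the thread count is treated as a per-thread cost, the
 * candidate thread count grows by doubling until the preferred work-group
 * multiple is reached and by that multiple afterwards, and a larger thread
 * count is only accepted if its per-thread cost is at least ~5% lower than the
 * best seen so far (the 1.05 factor from the hunk). The measure() callback
 * stands in for try_run_times() and is hypothetical; threads_start is assumed
 * to be >= 1.
 */
#include <stdint.h>

typedef uint32_t u32;

typedef double (*measure_fn) (u32 accel, u32 loops, u32 threads);

static u32 tune_threads (measure_fn measure, const u32 accel, const u32 loops,
                         const u32 threads_start, const u32 threads_max,
                         const u32 preferred_wgs_multiple, const double target_msec)
{
  u32 kernel_threads = threads_start;

  double threads_eff_best = measure (accel, loops, kernel_threads) / kernel_threads;

  for (u32 threads_test = threads_start; threads_test <= threads_max;
       threads_test = (threads_test < preferred_wgs_multiple)
                    ?  threads_test << 1
                    :  threads_test + preferred_wgs_multiple)
  {
    const double exec_msec = measure (accel, loops, threads_test);

    if (exec_msec > target_msec) break;

    const double threads_eff_cur = exec_msec / threads_test;

    // accept the higher thread count only if it is clearly more efficient
    if ((threads_eff_cur * 1.05) < threads_eff_best)
    {
      threads_eff_best = threads_eff_cur;

      kernel_threads = threads_test;
    }
  }

  return kernel_threads;
}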
module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13713.c b/src/modules/module_13713.c index d3c8d5fa4..7c67d69e5 100644 --- a/src/modules/module_13713.c +++ b/src/modules/module_13713.c @@ -146,6 +146,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -361,7 +368,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13721.c b/src/modules/module_13721.c index b6c937875..14a28165d 100644 --- a/src/modules/module_13721.c +++ b/src/modules/module_13721.c @@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -346,7 +353,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13722.c b/src/modules/module_13722.c index 3c92b2091..7850433b3 100644 --- a/src/modules/module_13722.c +++ b/src/modules/module_13722.c @@ -148,6 +148,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max 
(MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -363,7 +370,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13723.c b/src/modules/module_13723.c index dd8f513f7..fae3d81ec 100644 --- a/src/modules/module_13723.c +++ b/src/modules/module_13723.c @@ -148,6 +148,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -363,7 +370,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13731.c b/src/modules/module_13731.c index 27f0e5033..a3f175fd8 100644 --- a/src/modules/module_13731.c +++ b/src/modules/module_13731.c @@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13732.c b/src/modules/module_13732.c index e78dbaa83..4ac4cc8e9 100644 --- a/src/modules/module_13732.c +++ b/src/modules/module_13732.c @@ -129,6 +129,13 @@ u64 
module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13733.c b/src/modules/module_13733.c index 1d7c36628..29da2b0ca 100644 --- a/src/modules/module_13733.c +++ b/src/modules/module_13733.c @@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13751.c b/src/modules/module_13751.c index ee27acfd6..d2d0a50c2 100644 --- a/src/modules/module_13751.c +++ b/src/modules/module_13751.c @@ -166,6 +166,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -381,7 +388,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = 
module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13752.c b/src/modules/module_13752.c index 095758dd2..bf51b8d7b 100644 --- a/src/modules/module_13752.c +++ b/src/modules/module_13752.c @@ -166,6 +166,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -381,7 +388,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13753.c b/src/modules/module_13753.c index 4eee98625..fadb4ffed 100644 --- a/src/modules/module_13753.c +++ b/src/modules/module_13753.c @@ -166,6 +166,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -381,7 +388,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13771.c b/src/modules/module_13771.c index b753d171f..32951f7c2 100644 --- a/src/modules/module_13771.c +++ b/src/modules/module_13771.c @@ -150,6 +150,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 
kernel_loops_max = 1000; // lowest PIM multiplier @@ -365,7 +372,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13772.c b/src/modules/module_13772.c index 6494ee637..e6af92bfb 100644 --- a/src/modules/module_13772.c +++ b/src/modules/module_13772.c @@ -150,6 +150,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -365,7 +372,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_13773.c b/src/modules/module_13773.c index 6f2740347..32b6ddfb1 100644 --- a/src/modules/module_13773.c +++ b/src/modules/module_13773.c @@ -150,6 +150,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -365,7 +372,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29411.c b/src/modules/module_29411.c index a3e7160eb..60fa66f6f 100644 --- a/src/modules/module_29411.c +++ b/src/modules/module_29411.c @@ -114,6 +114,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, 
MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -356,7 +363,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29412.c b/src/modules/module_29412.c index 969e6d355..df64b2a6c 100644 --- a/src/modules/module_29412.c +++ b/src/modules/module_29412.c @@ -114,6 +114,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -356,7 +363,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29413.c b/src/modules/module_29413.c index 935d16dc5..7f455767d 100644 --- a/src/modules/module_29413.c +++ b/src/modules/module_29413.c @@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = 
module_kern_type; diff --git a/src/modules/module_29421.c b/src/modules/module_29421.c index 4c0e94163..69e4ba2d5 100644 --- a/src/modules/module_29421.c +++ b/src/modules/module_29421.c @@ -130,6 +130,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -372,7 +379,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29422.c b/src/modules/module_29422.c index d32d1471e..f3b4cc98c 100644 --- a/src/modules/module_29422.c +++ b/src/modules/module_29422.c @@ -141,6 +141,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -383,7 +390,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29423.c b/src/modules/module_29423.c index 988bc8a42..a038dd7a4 100644 --- a/src/modules/module_29423.c +++ b/src/modules/module_29423.c @@ -141,6 +141,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -383,7 +390,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; 
module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29431.c b/src/modules/module_29431.c index a22c24e3d..8555b19b7 100644 --- a/src/modules/module_29431.c +++ b/src/modules/module_29431.c @@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29432.c b/src/modules/module_29432.c index 563b254c0..b9c54a1ca 100644 --- a/src/modules/module_29432.c +++ b/src/modules/module_29432.c @@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29433.c b/src/modules/module_29433.c index 90e2033c9..519ff8fba 100644 --- a/src/modules/module_29433.c +++ b/src/modules/module_29433.c @@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return 
kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29451.c b/src/modules/module_29451.c index fe3c0737f..94d902273 100644 --- a/src/modules/module_29451.c +++ b/src/modules/module_29451.c @@ -151,6 +151,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -393,7 +400,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29452.c b/src/modules/module_29452.c index 842f36f47..1c5ac7f1b 100644 --- a/src/modules/module_29452.c +++ b/src/modules/module_29452.c @@ -151,6 +151,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -393,7 +400,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29453.c b/src/modules/module_29453.c index 6162337de..74322f135 100644 --- a/src/modules/module_29453.c +++ 
b/src/modules/module_29453.c @@ -151,6 +151,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -393,7 +400,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29471.c b/src/modules/module_29471.c index 18d50d294..a2bfe7996 100644 --- a/src/modules/module_29471.c +++ b/src/modules/module_29471.c @@ -143,6 +143,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -385,7 +392,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29472.c b/src/modules/module_29472.c index a4c0c275e..9a1614135 100644 --- a/src/modules/module_29472.c +++ b/src/modules/module_29472.c @@ -143,6 +143,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -385,7 +392,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = 
MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; diff --git a/src/modules/module_29473.c b/src/modules/module_29473.c index 744c1ac6d..b767dad92 100644 --- a/src/modules/module_29473.c +++ b/src/modules/module_29473.c @@ -143,6 +143,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_min = 250; + + return kernel_loops_min; +} + u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const u32 kernel_loops_max = 1000; // lowest PIM multiplier @@ -385,7 +392,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; From 9f3d7711378f3d1a722de9e1c32e53427aa6150e Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 6 Jul 2025 07:59:56 +0200 Subject: [PATCH 34/37] fix build error on src/user_options.c --- src/user_options.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/user_options.c b/src/user_options.c index 2647ac46a..cb27a9b47 100644 --- a/src/user_options.c +++ b/src/user_options.c @@ -1955,8 +1955,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx) || user_options->progress_only == true || user_options->identify == true || user_options->usage > 0 - || - > 0 + || user_options->hash_info > 0 || user_options->backend_info > 0) { user_options->hwmon = false; From aa10bcf80e8d5e242be5e2a6017708d5da629b06 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 6 Jul 2025 08:06:24 +0200 Subject: [PATCH 35/37] update remaining user_options->hash_info checks --- src/bridges.c | 12 ++++++------ src/hashes.c | 2 +- src/mpsp.c | 2 +- src/outfile_check.c | 4 ++-- src/potfile.c | 2 +- src/restore.c | 2 +- src/straight.c | 2 +- src/tuningdb.c | 2 +- src/wordlist.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/bridges.c b/src/bridges.c index 9b37aa25a..3df210f80 100644 --- a/src/bridges.c +++ b/src/bridges.c @@ -87,12 +87,12 @@ bool bridges_init (hashcat_ctx_t *hashcat_ctx) user_options_t *user_options = hashcat_ctx->user_options; hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - if (user_options->hash_info == true) return true; + if (user_options->backend_info > 0) return true; + if (user_options->hash_info > 0) return true; + if (user_options->usage > 0) return true; if (user_options->left == true) return true; if (user_options->show == true) return true; - if (user_options->usage > 0) return true; if (user_options->version == true) return true; - if (user_options->backend_info > 0) return true; // There is a problem here. At this point, hashconfig is not yet initialized. 
// This is because initializing hashconfig requires the module to be loaded, @@ -241,12 +241,12 @@ bool bridges_salt_prepare (hashcat_ctx_t *hashcat_ctx) hashes_t *hashes = hashcat_ctx->hashes; user_options_t *user_options = hashcat_ctx->user_options; - if (user_options->hash_info == true) return true; + if (user_options->backend_info > 0) return true; + if (user_options->hash_info > 0) return true; + if (user_options->usage > 0) return true; if (user_options->left == true) return true; if (user_options->show == true) return true; - if (user_options->usage > 0) return true; if (user_options->version == true) return true; - if (user_options->backend_info > 0) return true; if (bridge_ctx->enabled == false) return true; diff --git a/src/hashes.c b/src/hashes.c index 40d94941f..767c30824 100644 --- a/src/hashes.c +++ b/src/hashes.c @@ -1133,7 +1133,7 @@ int hashes_init_stage1 (hashcat_ctx_t *hashcat_ctx) hashes_cnt = 1; } - else if (user_options->hash_info == true) + else if (user_options->hash_info > 0) { } else if (user_options->keyspace == true) diff --git a/src/mpsp.c b/src/mpsp.c index 3efe345d7..0d2ae9ae0 100644 --- a/src/mpsp.c +++ b/src/mpsp.c @@ -1403,8 +1403,8 @@ int mask_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; - if (user_options->hash_info == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; if (user_options->version == true) return 0; diff --git a/src/outfile_check.c b/src/outfile_check.c index e681b7263..afee8ddb8 100644 --- a/src/outfile_check.c +++ b/src/outfile_check.c @@ -343,13 +343,13 @@ int outcheck_ctx_init (hashcat_ctx_t *hashcat_ctx) outcheck_ctx->enabled = false; + if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->keyspace == true) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->speed_only == true) return 0; if (user_options->progress_only == true) return 0; if (user_options->identify == true) return 0; - if (user_options->backend_info > 0) return 0; if (hashconfig->outfile_check_disable == true) return 0; if (user_options->outfile_check_timer == 0) return 0; diff --git a/src/potfile.c b/src/potfile.c index afafca2f1..9a87ed027 100644 --- a/src/potfile.c +++ b/src/potfile.c @@ -115,9 +115,9 @@ int potfile_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->stdout_flag == true) return 0; if (user_options->speed_only == true) return 0; diff --git a/src/restore.c b/src/restore.c index 2ca95ba92..e5c141bfc 100644 --- a/src/restore.c +++ b/src/restore.c @@ -312,9 +312,9 @@ int restore_ctx_init (hashcat_ctx_t *hashcat_ctx, int argc, char **argv) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; diff --git a/src/straight.c b/src/straight.c index 36d3a8eb8..58212be52 100644 --- a/src/straight.c +++ 
b/src/straight.c @@ -264,11 +264,11 @@ int straight_ctx_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0; if (user_options->version == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->attack_mode == ATTACK_MODE_BF) return 0;
diff --git a/src/tuningdb.c b/src/tuningdb.c index 41d3795aa..2c507feb7 100644 --- a/src/tuningdb.c +++ b/src/tuningdb.c @@ -56,8 +56,8 @@ int tuning_db_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; - if (user_options->hash_info == true) return 0; if (user_options->keyspace == true) return 0; if (user_options->left == true) return 0; if (user_options->show == true) return 0;
diff --git a/src/wordlist.c b/src/wordlist.c index 1776b4885..bfe09de89 100644 --- a/src/wordlist.c +++ b/src/wordlist.c @@ -697,9 +697,9 @@ int wl_data_init (hashcat_ctx_t *hashcat_ctx) if (user_options->usage > 0) return 0; if (user_options->backend_info > 0) return 0; + if (user_options->hash_info > 0) return 0; if (user_options->benchmark == true) return 0; - if (user_options->hash_info == true) return 0; if (user_options->left == true) return 0; if (user_options->version == true) return 0;
From 0576c4149100839ce87c0ac8fe8ad3d0836b2ad0 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 6 Jul 2025 10:14:20 +0200 Subject: [PATCH 36/37] Updated autotune to set initial values for accel, threads, and loops based on theoretical assumptions, with the aim of getting more accurate early results from the measured test runs. Updated autotune to use the iteration count of the first user-defined hash instead of the self-test hash for slow-hash tuning, assuming consistency across the hash list. Updated autotune to prefer the best-efficiency thread count only if it is at least 6% better than the max thread count, improving consistency in thread and accel values while still allowing exceptions for special modes like 18600. Changed the default theoretical free-memory estimate: when the runtime/OS cannot provide low-level free memory data, the reduction applied to maximum memory is now 34% instead of 20%. Applied the same logic, using the --backend-devices-keepfree percentage, to host memory during early setup, when hashcat auto-reduces thread and accel counts to stay within limits, and this is done per compute device. Changed terminal output from "Host memory required for this attack: ..." to "Host memory allocated for this attack: ...", and added free host memory as a reference.
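Illustration (not part of the patch): a minimal sketch of the per-device host-memory budgeting described above, mirroring the logic added to backend_session_begin () in the diff that follows. The helper name host_accel_limit_per_device and the use of <stdint.h> types are illustrative assumptions; hashcat itself works with its own u64/u32 typedefs and operates directly on accel_limit_host.

#include <stdint.h>

/* Sketch: either the user-supplied --backend-devices-keepfree percentage or the
   default 34% reduction is applied to the queried free host memory, and the
   remainder is split across the active compute devices. Assumes at least one
   active device. */
static uint64_t host_accel_limit_per_device (uint64_t free_host_mem, uint32_t keepfree_percent, uint32_t devices_active)
{
  if (keepfree_percent)
  {
    free_host_mem = (free_host_mem * (100 - keepfree_percent)) / 100;   // keep the requested percentage free
  }
  else
  {
    free_host_mem = free_host_mem - (uint64_t) (free_host_mem * 0.34);  // default: keep roughly 34% free
  }

  return free_host_mem / devices_active;                                // budget per active compute device
}

For example, 64 GiB of free host memory with the default reduction and two active devices yields a budget of roughly 21 GiB per device.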
--- src/autotune.c | 85 +++++++++++++++++++++++++++++++++++++++----------- src/backend.c | 17 ++++++++-- src/main.c | 12 ++++++- 3 files changed, 92 insertions(+), 22 deletions(-) diff --git a/src/autotune.c b/src/autotune.c index 4a4dc0d85..7938fb259 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -332,6 +332,22 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } // v7 autotuner is a lot more straight forward + // we start with some purely theoretical values as a base, then move on to some meassured tests + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + if (kernel_accel_min < kernel_accel_max) + { + // let's also do some minimal accel, this is only to improve early meassurements taken with try_run() + + const u32 kernel_accel_start = previous_power_of_two (kernel_accel_max / 8); + + if ((kernel_accel_start >= kernel_accel_min) && (kernel_accel_start <= kernel_accel_max)) + { + kernel_accel = kernel_accel_start; + } + } + } if (kernel_threads_min < kernel_threads_max) { @@ -348,24 +364,42 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } } - if (hashes && hashes->st_salts_buf) + if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL) { - u32 start = kernel_loops_max; - - const u32 salt_iter = hashes->st_salts_buf->salt_iter; - - if (salt_iter) + if (hashes && hashes->salts_buf) { - start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter)); - start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter + 1)); + u32 start = kernel_loops_max; - if (((hashes->st_salts_buf->salt_iter + 0) % 125) == 0) start = MIN (start, 125); - if (((hashes->st_salts_buf->salt_iter + 1) % 125) == 0) start = MIN (start, 125); + const u32 salt_iter = hashes->salts_buf->salt_iter; // we use the first salt as reference - if ((start >= kernel_loops_min) && (start <= kernel_loops_max)) + if (salt_iter) { - kernel_loops = start; + start = MIN (start, smallest_repeat_double (hashes->salts_buf->salt_iter)); + start = MIN (start, smallest_repeat_double (hashes->salts_buf->salt_iter + 1)); + + if (((hashes->salts_buf->salt_iter + 0) % 125) == 0) start = MIN (start, 125); + if (((hashes->salts_buf->salt_iter + 1) % 125) == 0) start = MIN (start, 125); + + if ((start >= kernel_loops_min) && (start <= kernel_loops_max)) + { + kernel_loops = start; + } } + else + { + // how can there be a slow hash with no iterations? + } + } + } + else + { + // let's also do some minimal loops, this is only to improve early meassurements taken with try_run() + + const u32 kernel_loops_start = previous_power_of_two (kernel_loops_max / 4); + + if ((kernel_loops_start >= kernel_loops_min) && (kernel_loops_start <= kernel_loops_max)) + { + kernel_loops = kernel_loops_start; } } @@ -396,30 +430,45 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param double exec_msec_init = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2); float threads_eff_best = exec_msec_init / kernel_threads; + u32 threads_cnt_best = kernel_threads; + + float threads_eff_prev = 0; + u32 threads_cnt_prev = 0; for (u32 kernel_threads_test = kernel_threads; kernel_threads_test <= kernel_threads_max; kernel_threads_test = (kernel_threads_test < device_param->kernel_preferred_wgs_multiple) ? 
kernel_threads_test << 1 : kernel_threads_test + device_param->kernel_preferred_wgs_multiple) { double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads_test, 2); + //printf ("thread %f %u %u %u\n", exec_msec, kernel_accel, kernel_loops, kernel_threads_test); if (exec_msec > target_msec) break; if (kernel_threads >= 32) { // we want a little room for accel to play with so not full target_msec - if (exec_msec > target_msec / 8) break; + if (exec_msec > target_msec / 4) break; } - float threads_eff_cur = exec_msec / kernel_threads_test; + kernel_threads = kernel_threads_test; - if ((threads_eff_cur * 1.05) < threads_eff_best) + threads_eff_prev = exec_msec / kernel_threads_test; + threads_cnt_prev = kernel_threads_test; + + //printf ("%f\n", threads_eff_prev); + + if (threads_eff_prev < threads_eff_best) { - threads_eff_best = threads_eff_cur; - - kernel_threads = kernel_threads_test; + threads_eff_best = threads_eff_prev; + threads_cnt_best = threads_cnt_prev; } } + // now we decide to choose either maximum or in some extreme cases prefer more efficient ones + if ((threads_eff_best * 1.06) < threads_eff_prev) + { + kernel_threads = threads_cnt_best; + } + #define STEPS_CNT 12 // now we tune for kernel-accel but with the new kernel-loops from previous loop set diff --git a/src/backend.c b/src/backend.c index 789dcd7df..6e8fe4aad 100644 --- a/src/backend.c +++ b/src/backend.c @@ -10217,10 +10217,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { const u64 device_available_mem_sav = device_param->device_available_mem; - const u64 device_available_mem_new = device_available_mem_sav - (device_available_mem_sav * 0.2); + const u64 device_available_mem_new = device_available_mem_sav - (device_available_mem_sav * 0.34); event_log_warning (hashcat_ctx, "* Device #%u: This system does not offer any reliable method to query actual free memory. 
Estimated base: %" PRIu64, device_id + 1, device_available_mem_sav); - event_log_warning (hashcat_ctx, " Assuming normal desktop activity, reducing estimate by 20%%: %" PRIu64, device_available_mem_new); + event_log_warning (hashcat_ctx, " Assuming normal desktop activity, reducing estimate by 34%%: %" PRIu64, device_available_mem_new); event_log_warning (hashcat_ctx, " This can hurt performance drastically, especially on memory-heavy algorithms."); event_log_warning (hashcat_ctx, " You can adjust this percentage using --backend-devices-keepfree"); event_log_warning (hashcat_ctx, NULL); @@ -16275,12 +16275,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) { const u64 GiB4 = 4ULL * 1024 * 1024 * 1024; - event_log_warning (hashcat_ctx, "Couldn't query the OS for free memory, assuming 4GiB"); + event_log_warning (hashcat_ctx, "Couldn't query the OS for free memory, assuming 4GiB is available per compute device"); accel_limit_host = GiB4; } else { + if (user_options->backend_devices_keepfree) + { + accel_limit_host = ((u64) accel_limit_host * (100 - user_options->backend_devices_keepfree)) / 100; + } + else + { + accel_limit_host = accel_limit_host - (accel_limit_host * 0.34); + } + + accel_limit_host /= backend_ctx->backend_devices_active; + // even tho let's not be greedy const u64 GiB8 = 8ULL * 1024 * 1024 * 1024; diff --git a/src/main.c b/src/main.c index 994b0bc9f..69d89e81b 100644 --- a/src/main.c +++ b/src/main.c @@ -643,7 +643,17 @@ static void main_backend_session_hostmem (MAYBE_UNUSED hashcat_ctx_t *hashcat_ct const u64 *hostmem = (const u64 *) buf; - event_log_info (hashcat_ctx, "Host memory required for this attack: %" PRIu64 " MB", *hostmem / (1024 * 1024)); + u64 free_memory = 0; + + if (get_free_memory (&free_memory) == false) + { + event_log_info (hashcat_ctx, "Host memory allocated for this attack: %" PRIu64 " MB", *hostmem / (1024 * 1024)); + } + else + { + event_log_info (hashcat_ctx, "Host memory allocated for this attack: %" PRIu64 " MB (%" PRIu64 " MB free)", *hostmem / (1024 * 1024), free_memory / (1024 * 1024)); + } + event_log_info (hashcat_ctx, NULL); } From f663abee4426cba0cc80ea0ce9adf5218fae85c9 Mon Sep 17 00:00:00 2001 From: Gabriele Gristina Date: Sun, 6 Jul 2025 21:28:37 +0200 Subject: [PATCH 37/37] Added workaround to get rid of internal runtimes memory leaks As of now, especially in the benchmark mode, hashcat will not go to create and destroy context and command-queue for each enabled device each time it switches from one hash-mode to the next. Specifically using OpenCL with an NVIDIA device, it was not possible to complete the benchmark because clCreateContext has memory leaks that slowly consume all available GPU memory until hashcat can activate a new context and disable the device. Avoid deprecated HIP functions All hipCtx* features have been declared deprecated, so we have replaced them with the new ones, also fixing a critical bug on handling multiple AMD devices in the same system. 
--- docs/changes.txt | 2 + include/ext_hip.h | 976 +++++++++++++++++++++++----------------------- src/autotune.c | 7 +- src/backend.c | 370 +++++++----------- src/dispatch.c | 14 +- src/ext_hip.c | 210 +++++++--- src/selftest.c | 4 +- 7 files changed, 793 insertions(+), 790 deletions(-) diff --git a/docs/changes.txt b/docs/changes.txt index de2d26bcd..c2af93956 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -129,6 +129,7 @@ - Alias Devices: Prevents hashcat, when started with x86_64 emulation on Apple Silicon, from showing the Apple M1 OpenCL CPU as an alias for the Apple M1 Metal GPU - Apple Driver: Automatically enable GPU support on Apple OpenCL instead of CPU support - Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use +- Backend: Added workaround to get rid of internal runtimes memory leaks - Backend: Updated filename chksum format to prevent invalid cache on Apple Silicon when switching arch - Backend: Splitting backend_ctx_devices_init into smaller runtime-specific functions - Backend Checks: Describe workaround in error message when detecting more than 64 backend devices @@ -141,6 +142,7 @@ - Building: Support building windows binaries on macOS using MinGW - Dependencies: Updated OpenCL-Headers to v2024.10.24 (commit 265df85) - Documents: Updated BUILD.md and added BUILD_macOS.md (containing instructions for building windows binaries on macOS) +- HIP Backend: Avoid deprecated functions - Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py` - Modules: Updated module_unstable_warning - Open Document Format: Added support for small documents with content length < 1024 diff --git a/include/ext_hip.h b/include/ext_hip.h index d0f53d173..3c0b8433a 100644 --- a/include/ext_hip.h +++ b/include/ext_hip.h @@ -12,20 +12,22 @@ // start: driver_types.h -typedef void* hipDeviceptr_t; +typedef void *hipDeviceptr_t; + +typedef enum hipFunction_attribute +{ + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, // The maximum number of threads per block. Depends on function and device. + HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, // The statically allocated shared memory size in bytes per block required by the function. + HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, // The user-allocated constant memory by the function in bytes. + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, // The local memory usage of each thread by this function in bytes. + HIP_FUNC_ATTRIBUTE_NUM_REGS, // The number of registers used by each thread of this function. + HIP_FUNC_ATTRIBUTE_PTX_VERSION, // PTX version + HIP_FUNC_ATTRIBUTE_BINARY_VERSION, // Binary version + HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA, // Cache mode + HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, // The maximum dynamic shared memory per block for this function in bytes. + HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, // The shared memory carveout preference in percent of the maximum shared memory. + HIP_FUNC_ATTRIBUTE_MAX -typedef enum hipFunction_attribute { - HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, ///< The maximum number of threads per block. Depends on function and device. - HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, ///< The statically allocated shared memory size in bytes per block required by the function. - HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES, ///< The user-allocated constant memory by the function in bytes. - HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, ///< The local memory usage of each thread by this function in bytes. 
- HIP_FUNC_ATTRIBUTE_NUM_REGS, ///< The number of registers used by each thread of this function. - HIP_FUNC_ATTRIBUTE_PTX_VERSION, ///< PTX version - HIP_FUNC_ATTRIBUTE_BINARY_VERSION, ///< Binary version - HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA, ///< Cache mode - HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, ///< The maximum dynamic shared memory per block for this function in bytes. - HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, ///< The shared memory carveout preference in percent of the maximum shared memory. - HIP_FUNC_ATTRIBUTE_MAX } hipFunction_attribute; // stop: driver_types.h @@ -47,120 +49,102 @@ typedef struct ihipModuleSymbol_t* hipFunction_t; #define __HIP_NODISCARD #endif -typedef enum __HIP_NODISCARD hipError_t { - hipSuccess = 0, ///< Successful completion. - hipErrorInvalidValue = 1, ///< One or more of the parameters passed to the API call is NULL - ///< or not in an acceptable range. - hipErrorOutOfMemory = 2, - // Deprecated - hipErrorMemoryAllocation = 2, ///< Memory allocation error. - hipErrorNotInitialized = 3, - // Deprecated - hipErrorInitializationError = 3, - hipErrorDeinitialized = 4, - hipErrorProfilerDisabled = 5, - hipErrorProfilerNotInitialized = 6, - hipErrorProfilerAlreadyStarted = 7, - hipErrorProfilerAlreadyStopped = 8, - hipErrorInvalidConfiguration = 9, - hipErrorInvalidPitchValue = 12, - hipErrorInvalidSymbol = 13, - hipErrorInvalidDevicePointer = 17, ///< Invalid Device Pointer - hipErrorInvalidMemcpyDirection = 21, ///< Invalid memory copy direction - hipErrorInsufficientDriver = 35, - hipErrorMissingConfiguration = 52, - hipErrorPriorLaunchFailure = 53, - hipErrorInvalidDeviceFunction = 98, - hipErrorNoDevice = 100, ///< Call to hipGetDeviceCount returned 0 devices - hipErrorInvalidDevice = 101, ///< DeviceID must be in range 0...#compute-devices. - hipErrorInvalidImage = 200, - hipErrorInvalidContext = 201, ///< Produced when input context is invalid. - hipErrorContextAlreadyCurrent = 202, - hipErrorMapFailed = 205, - // Deprecated - hipErrorMapBufferObjectFailed = 205, ///< Produced when the IPC memory attach failed from ROCr. - hipErrorUnmapFailed = 206, - hipErrorArrayIsMapped = 207, - hipErrorAlreadyMapped = 208, - hipErrorNoBinaryForGpu = 209, - hipErrorAlreadyAcquired = 210, - hipErrorNotMapped = 211, - hipErrorNotMappedAsArray = 212, - hipErrorNotMappedAsPointer = 213, - hipErrorECCNotCorrectable = 214, - hipErrorUnsupportedLimit = 215, - hipErrorContextAlreadyInUse = 216, - hipErrorPeerAccessUnsupported = 217, - hipErrorInvalidKernelFile = 218, ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX - hipErrorInvalidGraphicsContext = 219, - hipErrorInvalidSource = 300, - hipErrorFileNotFound = 301, - hipErrorSharedObjectSymbolNotFound = 302, - hipErrorSharedObjectInitFailed = 303, - hipErrorOperatingSystem = 304, - hipErrorInvalidHandle = 400, - // Deprecated - hipErrorInvalidResourceHandle = 400, ///< Resource handle (hipEvent_t or hipStream_t) invalid. - hipErrorIllegalState = 401, ///< Resource required is not in a valid state to perform operation. - hipErrorNotFound = 500, - hipErrorNotReady = 600, ///< Indicates that asynchronous operations enqueued earlier are not - ///< ready. This is not actually an error, but is used to distinguish - ///< from hipSuccess (which indicates completion). APIs that return - ///< this error include hipEventQuery and hipStreamQuery. - hipErrorIllegalAddress = 700, - hipErrorLaunchOutOfResources = 701, ///< Out of resources error. 
- hipErrorLaunchTimeOut = 702, - hipErrorPeerAccessAlreadyEnabled = - 704, ///< Peer access was already enabled from the current device. - hipErrorPeerAccessNotEnabled = - 705, ///< Peer access was never enabled from the current device. - hipErrorSetOnActiveProcess = 708, - hipErrorContextIsDestroyed = 709, - hipErrorAssert = 710, ///< Produced when the kernel calls assert. - hipErrorHostMemoryAlreadyRegistered = - 712, ///< Produced when trying to lock a page-locked memory. - hipErrorHostMemoryNotRegistered = - 713, ///< Produced when trying to unlock a non-page-locked memory. - hipErrorLaunchFailure = - 719, ///< An exception occurred on the device while executing a kernel. - hipErrorCooperativeLaunchTooLarge = - 720, ///< This error indicates that the number of blocks launched per grid for a kernel - ///< that was launched via cooperative launch APIs exceeds the maximum number of - ///< allowed blocks for the current device - hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented - hipErrorStreamCaptureUnsupported = 900, ///< The operation is not permitted when the stream - ///< is capturing. - hipErrorStreamCaptureInvalidated = 901, ///< The current capture sequence on the stream - ///< has been invalidated due to a previous error. - hipErrorStreamCaptureMerge = 902, ///< The operation would have resulted in a merge of - ///< two independent capture sequences. - hipErrorStreamCaptureUnmatched = 903, ///< The capture was not initiated in this stream. - hipErrorStreamCaptureUnjoined = 904, ///< The capture sequence contains a fork that was not - ///< joined to the primary stream. - hipErrorStreamCaptureIsolation = 905, ///< A dependency would have been created which crosses - ///< the capture sequence boundary. Only implicit - ///< in-stream ordering dependencies are allowed - ///< to cross the boundary - hipErrorStreamCaptureImplicit = 906, ///< The operation would have resulted in a disallowed - ///< implicit dependency on a current capture sequence - ///< from hipStreamLegacy. - hipErrorCapturedEvent = 907, ///< The operation is not permitted on an event which was last - ///< recorded in a capturing stream. - hipErrorStreamCaptureWrongThread = 908, ///< A stream capture sequence not initiated with - ///< the hipStreamCaptureModeRelaxed argument to - ///< hipStreamBeginCapture was passed to - ///< hipStreamEndCapture in a different thread. - hipErrorGraphExecUpdateFailure = 910, ///< This error indicates that the graph update - ///< not performed because it included changes which - ///< violated constraints specific to instantiated graph - ///< update. - hipErrorUnknown = 999, //< Unknown error. - // HSA Runtime Error Codes start here. - hipErrorRuntimeMemory = 1052, ///< HSA runtime memory call returned error. Typically not seen - ///< in production systems. - hipErrorRuntimeOther = 1053, ///< HSA runtime call other than memory returned error. Typically - ///< not seen in production systems. - hipErrorTbd ///< Marker that more error codes are needed. +typedef enum __HIP_NODISCARD hipError_t +{ + hipSuccess = 0, // Successful completion. + hipErrorInvalidValue = 1, // One or more of the parameters passed to the API call is NULL + // or not in an acceptable range. + hipErrorOutOfMemory = 2, + // Deprecated + hipErrorMemoryAllocation = 2, // Memory allocation error. 
+ hipErrorNotInitialized = 3, + // Deprecated + hipErrorInitializationError = 3, + hipErrorDeinitialized = 4, + hipErrorProfilerDisabled = 5, + hipErrorProfilerNotInitialized = 6, + hipErrorProfilerAlreadyStarted = 7, + hipErrorProfilerAlreadyStopped = 8, + hipErrorInvalidConfiguration = 9, + hipErrorInvalidPitchValue = 12, + hipErrorInvalidSymbol = 13, + hipErrorInvalidDevicePointer = 17, // Invalid Device Pointer + hipErrorInvalidMemcpyDirection = 21, // Invalid memory copy direction + hipErrorInsufficientDriver = 35, + hipErrorMissingConfiguration = 52, + hipErrorPriorLaunchFailure = 53, + hipErrorInvalidDeviceFunction = 98, + hipErrorNoDevice = 100, // Call to hipGetDeviceCount returned 0 devices + hipErrorInvalidDevice = 101, // DeviceID must be in range 0...#compute-devices. + hipErrorInvalidImage = 200, + hipErrorInvalidContext = 201, // Produced when input context is invalid. + hipErrorContextAlreadyCurrent = 202, + hipErrorMapFailed = 205, + // Deprecated + hipErrorMapBufferObjectFailed = 205, // Produced when the IPC memory attach failed from ROCr. + hipErrorUnmapFailed = 206, + hipErrorArrayIsMapped = 207, + hipErrorAlreadyMapped = 208, + hipErrorNoBinaryForGpu = 209, + hipErrorAlreadyAcquired = 210, + hipErrorNotMapped = 211, + hipErrorNotMappedAsArray = 212, + hipErrorNotMappedAsPointer = 213, + hipErrorECCNotCorrectable = 214, + hipErrorUnsupportedLimit = 215, + hipErrorContextAlreadyInUse = 216, + hipErrorPeerAccessUnsupported = 217, + hipErrorInvalidKernelFile = 218, // In CUDA DRV, it is CUDA_ERROR_INVALID_PTX + hipErrorInvalidGraphicsContext = 219, + hipErrorInvalidSource = 300, + hipErrorFileNotFound = 301, + hipErrorSharedObjectSymbolNotFound = 302, + hipErrorSharedObjectInitFailed = 303, + hipErrorOperatingSystem = 304, + hipErrorInvalidHandle = 400, + // Deprecated + hipErrorInvalidResourceHandle = 400, // Resource handle (hipEvent_t or hipStream_t) invalid. + hipErrorIllegalState = 401, // Resource required is not in a valid state to perform operation. + hipErrorNotFound = 500, + hipErrorNotReady = 600, // Indicates that asynchronous operations enqueued earlier are not ready. + // This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). + // APIs that return this error include hipEventQuery and hipStreamQuery. + hipErrorIllegalAddress = 700, + hipErrorLaunchOutOfResources = 701, // Out of resources error. + hipErrorLaunchTimeOut = 702, + hipErrorPeerAccessAlreadyEnabled = 704, // Peer access was already enabled from the current device. + hipErrorPeerAccessNotEnabled = 705, // Peer access was never enabled from the current device. + hipErrorSetOnActiveProcess = 708, + hipErrorContextIsDestroyed = 709, + hipErrorAssert = 710, // Produced when the kernel calls assert. + hipErrorHostMemoryAlreadyRegistered = 712, // Produced when trying to lock a page-locked memory. + hipErrorHostMemoryNotRegistered = 713, // Produced when trying to unlock a non-page-locked memory. + hipErrorLaunchFailure = 719, // An exception occurred on the device while executing a kernel. + hipErrorCooperativeLaunchTooLarge = 720, // This error indicates that the number of blocks launched per grid for a kernel + // that was launched via cooperative launch APIs exceeds the maximum number of + // allowed blocks for the current device + hipErrorNotSupported = 801, // Produced when the hip API is not supported/implemented + hipErrorStreamCaptureUnsupported = 900, // The operation is not permitted when the stream is capturing. 
+ hipErrorStreamCaptureInvalidated = 901, // The current capture sequence on the stream + // has been invalidated due to a previous error. + hipErrorStreamCaptureMerge = 902, // The operation would have resulted in a merge of two independent capture sequences. + hipErrorStreamCaptureUnmatched = 903, // The capture was not initiated in this stream. + hipErrorStreamCaptureUnjoined = 904, // The capture sequence contains a fork that was not joined to the primary stream. + hipErrorStreamCaptureIsolation = 905, // A dependency would have been created which crosses the capture sequence boundary. + // Only implicit in-stream ordering dependencies are allowed to cross the boundary + hipErrorStreamCaptureImplicit = 906, // The operation would have resulted in a disallowed implicit dependency on a current + // capture sequence from hipStreamLegacy. + hipErrorCapturedEvent = 907, // The operation is not permitted on an event which was last recorded in a capturing stream. + hipErrorStreamCaptureWrongThread = 908, // A stream capture sequence not initiated with the hipStreamCaptureModeRelaxed argument to + // hipStreamBeginCapture was passed to hipStreamEndCapture in a different thread. + hipErrorGraphExecUpdateFailure = 910, // This error indicates that the graph update not performed because it included changes which + // violated constraints specific to instantiated graph update. + hipErrorUnknown = 999, // Unknown error. + // HSA Runtime Error Codes start here. + hipErrorRuntimeMemory = 1052, // HSA runtime memory call returned error. Typically not seen in production systems. + hipErrorRuntimeOther = 1053, // HSA runtime call other than memory returned error. Typically not seen in production systems. + hipErrorTbd // Marker that more error codes are needed. + } hipError_t; #undef __HIP_NODISCARD @@ -170,178 +154,178 @@ typedef enum __HIP_NODISCARD hipError_t { * hipDeviceAttribute_t * hipDeviceAttributeUnused number: 5 */ -typedef enum hipDeviceAttribute_t { - hipDeviceAttributeCudaCompatibleBegin = 0, +typedef enum hipDeviceAttribute_t +{ + hipDeviceAttributeCudaCompatibleBegin = 0, - hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled. - hipDeviceAttributeAccessPolicyMaxWindowSize, ///< Cuda only. The maximum size of the window policy in bytes. - hipDeviceAttributeAsyncEngineCount, ///< Asynchronous engines number. - hipDeviceAttributeCanMapHostMemory, ///< Whether host memory can be mapped into device address space - hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Device can access host registered memory - ///< at the same virtual address as the CPU - hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. - hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. - hipDeviceAttributeComputePreemptionSupported, ///< Device supports Compute Preemption. - hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. - hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory concurrently with the CPU - hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch - hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices - hipDeviceAttributeDeviceOverlap, ///< Device can concurrently copy memory and execute a kernel. - ///< Deprecated. Use instead asyncEngineCount. 
- hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on - ///< the device without migration - hipDeviceAttributeGlobalL1CacheSupported, ///< Device supports caching globals in L1 - hipDeviceAttributeHostNativeAtomicSupported, ///< Link between the device and the host supports native atomic operations - hipDeviceAttributeIntegrated, ///< Device is integrated GPU - hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. - hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device - hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. - hipDeviceAttributeLocalL1CacheSupported, ///< caching locals in L1 is supported - hipDeviceAttributeLuid, ///< 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms - hipDeviceAttributeLuidDeviceNodeMask, ///< Luid device node mask. Undefined on TCC and non-Windows platforms - hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. - hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system - hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Max block size per multiprocessor - hipDeviceAttributeMaxBlockDimX, ///< Max block size in width. - hipDeviceAttributeMaxBlockDimY, ///< Max block size in height. - hipDeviceAttributeMaxBlockDimZ, ///< Max block size in depth. - hipDeviceAttributeMaxGridDimX, ///< Max grid size in width. - hipDeviceAttributeMaxGridDimY, ///< Max grid size in height. - hipDeviceAttributeMaxGridDimZ, ///< Max grid size in depth. - hipDeviceAttributeMaxSurface1D, ///< Maximum size of 1D surface. - hipDeviceAttributeMaxSurface1DLayered, ///< Cuda only. Maximum dimensions of 1D layered surface. - hipDeviceAttributeMaxSurface2D, ///< Maximum dimension (width, height) of 2D surface. - hipDeviceAttributeMaxSurface2DLayered, ///< Cuda only. Maximum dimensions of 2D layered surface. - hipDeviceAttributeMaxSurface3D, ///< Maximum dimension (width, height, depth) of 3D surface. - hipDeviceAttributeMaxSurfaceCubemap, ///< Cuda only. Maximum dimensions of Cubemap surface. - hipDeviceAttributeMaxSurfaceCubemapLayered, ///< Cuda only. Maximum dimension of Cubemap layered surface. - hipDeviceAttributeMaxTexture1DWidth, ///< Maximum size of 1D texture. - hipDeviceAttributeMaxTexture1DLayered, ///< Maximum dimensions of 1D layered texture. - hipDeviceAttributeMaxTexture1DLinear, ///< Maximum number of elements allocatable in a 1D linear texture. - ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. - hipDeviceAttributeMaxTexture1DMipmap, ///< Maximum size of 1D mipmapped texture. - hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D texture. - hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension hight of 2D texture. - hipDeviceAttributeMaxTexture2DGather, ///< Maximum dimensions of 2D texture if gather operations performed. - hipDeviceAttributeMaxTexture2DLayered, ///< Maximum dimensions of 2D layered texture. - hipDeviceAttributeMaxTexture2DLinear, ///< Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. - hipDeviceAttributeMaxTexture2DMipmap, ///< Maximum dimensions of 2D mipmapped texture. - hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D texture. - hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D texture. - hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D texture. 
- hipDeviceAttributeMaxTexture3DAlt, ///< Maximum dimensions of alternate 3D texture. - hipDeviceAttributeMaxTextureCubemap, ///< Maximum dimensions of Cubemap texture - hipDeviceAttributeMaxTextureCubemapLayered, ///< Maximum dimensions of Cubemap layered texture. - hipDeviceAttributeMaxThreadsDim, ///< Maximum dimension of a block - hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. - hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. - hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies - hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. - hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. - hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. - hipDeviceAttributeMultiGpuBoardGroupID, ///< Unique ID of device group on the same multi-GPU board - hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. - hipDeviceAttributeUnused1, ///< Previously hipDeviceAttributeName - hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory - ///< without calling hipHostRegister on it - hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables - hipDeviceAttributePciBusId, ///< PCI Bus ID. - hipDeviceAttributePciDeviceId, ///< PCI Device ID. - hipDeviceAttributePciDomainID, ///< PCI Domain ID. - hipDeviceAttributePersistingL2CacheMaxSize, ///< Maximum l2 persisting lines capacity in bytes - hipDeviceAttributeMaxRegistersPerBlock, ///< 32-bit registers available to a thread block. This number is shared - ///< by all thread blocks simultaneously resident on a multiprocessor. - hipDeviceAttributeMaxRegistersPerMultiprocessor, ///< 32-bit registers available per block. - hipDeviceAttributeReservedSharedMemPerBlock, ///< Shared memory reserved by CUDA driver per block. - hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. - hipDeviceAttributeSharedMemPerBlockOptin, ///< Maximum shared memory per block usable by special opt in. - hipDeviceAttributeSharedMemPerMultiprocessor, ///< Shared memory available per multiprocessor. - hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision. - hipDeviceAttributeStreamPrioritiesSupported, ///< Whether to support stream priorities. - hipDeviceAttributeSurfaceAlignment, ///< Alignment requirement for surfaces - hipDeviceAttributeTccDriver, ///< Cuda only. Whether device is a Tesla device using TCC driver - hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures - hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory; - hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. - hipDeviceAttributeTotalGlobalMem, ///< Global memory available on devicice. - hipDeviceAttributeUnifiedAddressing, ///< Cuda only. An unified address space shared with the host. - hipDeviceAttributeUnused2, ///< Previously hipDeviceAttributeUuid - hipDeviceAttributeWarpSize, ///< Warp size in threads. 
- hipDeviceAttributeMemoryPoolsSupported, ///< Device supports HIP Stream Ordered Memory Allocator - hipDeviceAttributeVirtualMemoryManagementSupported, ///< Device supports HIP virtual memory management - hipDeviceAttributeHostRegisterSupported, ///< Can device support host memory registration via hipHostRegister - hipDeviceAttributeMemoryPoolSupportedHandleTypes, ///< Supported handle mask for HIP Stream Ordered Memory Allocator + hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, // Whether ECC support is enabled. + hipDeviceAttributeAccessPolicyMaxWindowSize, // Cuda only. The maximum size of the window policy in bytes. + hipDeviceAttributeAsyncEngineCount, // Asynchronous engines number. + hipDeviceAttributeCanMapHostMemory, // Whether host memory can be mapped into device address space + hipDeviceAttributeCanUseHostPointerForRegisteredMem, // Device can access host registered memory + // at the same virtual address as the CPU + hipDeviceAttributeClockRate, // Peak clock frequency in kilohertz. + hipDeviceAttributeComputeMode, // Compute mode that device is currently in. + hipDeviceAttributeComputePreemptionSupported, // Device supports Compute Preemption. + hipDeviceAttributeConcurrentKernels, // Device can possibly execute multiple kernels concurrently. + hipDeviceAttributeConcurrentManagedAccess, // Device can coherently access managed memory concurrently with the CPU + hipDeviceAttributeCooperativeLaunch, // Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch, // Support cooperative launch on multiple devices + hipDeviceAttributeDeviceOverlap, // Device can concurrently copy memory and execute a kernel. + // Deprecated. Use instead asyncEngineCount. + hipDeviceAttributeDirectManagedMemAccessFromHost, // Host can directly access managed memory on + // the device without migration + hipDeviceAttributeGlobalL1CacheSupported, // Device supports caching globals in L1 + hipDeviceAttributeHostNativeAtomicSupported, // Link between the device and the host supports native atomic operations + hipDeviceAttributeIntegrated, // Device is integrated GPU + hipDeviceAttributeIsMultiGpuBoard, // Multiple GPU devices. + hipDeviceAttributeKernelExecTimeout, // Run time limit for kernels executed on the device + hipDeviceAttributeL2CacheSize, // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. + hipDeviceAttributeLocalL1CacheSupported, // caching locals in L1 is supported + hipDeviceAttributeLuid, // 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuidDeviceNodeMask, // Luid device node mask. Undefined on TCC and non-Windows platforms + hipDeviceAttributeComputeCapabilityMajor, // Major compute capability version number. + hipDeviceAttributeManagedMemory, // Device supports allocating managed memory on this system + hipDeviceAttributeMaxBlocksPerMultiProcessor, // Max block size per multiprocessor + hipDeviceAttributeMaxBlockDimX, // Max block size in width. + hipDeviceAttributeMaxBlockDimY, // Max block size in height. + hipDeviceAttributeMaxBlockDimZ, // Max block size in depth. + hipDeviceAttributeMaxGridDimX, // Max grid size in width. + hipDeviceAttributeMaxGridDimY, // Max grid size in height. + hipDeviceAttributeMaxGridDimZ, // Max grid size in depth. + hipDeviceAttributeMaxSurface1D, // Maximum size of 1D surface. + hipDeviceAttributeMaxSurface1DLayered, // Cuda only. Maximum dimensions of 1D layered surface. 
+ hipDeviceAttributeMaxSurface2D, // Maximum dimension (width, height) of 2D surface. + hipDeviceAttributeMaxSurface2DLayered, // Cuda only. Maximum dimensions of 2D layered surface. + hipDeviceAttributeMaxSurface3D, // Maximum dimension (width, height, depth) of 3D surface. + hipDeviceAttributeMaxSurfaceCubemap, // Cuda only. Maximum dimensions of Cubemap surface. + hipDeviceAttributeMaxSurfaceCubemapLayered, // Cuda only. Maximum dimension of Cubemap layered surface. + hipDeviceAttributeMaxTexture1DWidth, // Maximum size of 1D texture. + hipDeviceAttributeMaxTexture1DLayered, // Maximum dimensions of 1D layered texture. + hipDeviceAttributeMaxTexture1DLinear, // Maximum number of elements allocatable in a 1D linear texture. + // Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. + hipDeviceAttributeMaxTexture1DMipmap, // Maximum size of 1D mipmapped texture. + hipDeviceAttributeMaxTexture2DWidth, // Maximum dimension width of 2D texture. + hipDeviceAttributeMaxTexture2DHeight, // Maximum dimension hight of 2D texture. + hipDeviceAttributeMaxTexture2DGather, // Maximum dimensions of 2D texture if gather operations performed. + hipDeviceAttributeMaxTexture2DLayered, // Maximum dimensions of 2D layered texture. + hipDeviceAttributeMaxTexture2DLinear, // Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. + hipDeviceAttributeMaxTexture2DMipmap, // Maximum dimensions of 2D mipmapped texture. + hipDeviceAttributeMaxTexture3DWidth, // Maximum dimension width of 3D texture. + hipDeviceAttributeMaxTexture3DHeight, // Maximum dimension height of 3D texture. + hipDeviceAttributeMaxTexture3DDepth, // Maximum dimension depth of 3D texture. + hipDeviceAttributeMaxTexture3DAlt, // Maximum dimensions of alternate 3D texture. + hipDeviceAttributeMaxTextureCubemap, // Maximum dimensions of Cubemap texture + hipDeviceAttributeMaxTextureCubemapLayered, // Maximum dimensions of Cubemap layered texture. + hipDeviceAttributeMaxThreadsDim, // Maximum dimension of a block + hipDeviceAttributeMaxThreadsPerBlock, // Maximum number of threads per block. + hipDeviceAttributeMaxThreadsPerMultiProcessor, // Maximum resident threads per multiprocessor. + hipDeviceAttributeMaxPitch, // Maximum pitch in bytes allowed by memory copies + hipDeviceAttributeMemoryBusWidth, // Global memory bus width in bits. + hipDeviceAttributeMemoryClockRate, // Peak memory clock frequency in kilohertz. + hipDeviceAttributeComputeCapabilityMinor, // Minor compute capability version number. + hipDeviceAttributeMultiGpuBoardGroupID, // Unique ID of device group on the same multi-GPU board + hipDeviceAttributeMultiprocessorCount, // Number of multiprocessors on the device. + hipDeviceAttributeUnused1, // Previously hipDeviceAttributeName + hipDeviceAttributePageableMemoryAccess, // Device supports coherently accessing pageable memory + // without calling hipHostRegister on it + hipDeviceAttributePageableMemoryAccessUsesHostPageTables, // Device accesses pageable memory via the host's page tables + hipDeviceAttributePciBusId, // PCI Bus ID. + hipDeviceAttributePciDeviceId, // PCI Device ID. + hipDeviceAttributePciDomainID, // PCI Domain ID. + hipDeviceAttributePersistingL2CacheMaxSize, // Maximum l2 persisting lines capacity in bytes + hipDeviceAttributeMaxRegistersPerBlock, // 32-bit registers available to a thread block. This number is shared + // by all thread blocks simultaneously resident on a multiprocessor. + hipDeviceAttributeMaxRegistersPerMultiprocessor, // 32-bit registers available per block. 
+ hipDeviceAttributeReservedSharedMemPerBlock, // Shared memory reserved by CUDA driver per block. + hipDeviceAttributeMaxSharedMemoryPerBlock, // Maximum shared memory available per block in bytes. + hipDeviceAttributeSharedMemPerBlockOptin, // Maximum shared memory per block usable by special opt in. + hipDeviceAttributeSharedMemPerMultiprocessor, // Shared memory available per multiprocessor. + hipDeviceAttributeSingleToDoublePrecisionPerfRatio, // Cuda only. Performance ratio of single precision to double precision. + hipDeviceAttributeStreamPrioritiesSupported, // Whether to support stream priorities. + hipDeviceAttributeSurfaceAlignment, // Alignment requirement for surfaces + hipDeviceAttributeTccDriver, // Cuda only. Whether device is a Tesla device using TCC driver + hipDeviceAttributeTextureAlignment, // Alignment requirement for textures + hipDeviceAttributeTexturePitchAlignment, // Pitch alignment requirement for 2D texture references bound to pitched memory; + hipDeviceAttributeTotalConstantMemory, // Constant memory size in bytes. + hipDeviceAttributeTotalGlobalMem, // Global memory available on devicice. + hipDeviceAttributeUnifiedAddressing, // Cuda only. An unified address space shared with the host. + hipDeviceAttributeUnused2, // Previously hipDeviceAttributeUuid + hipDeviceAttributeWarpSize, // Warp size in threads. + hipDeviceAttributeMemoryPoolsSupported, // Device supports HIP Stream Ordered Memory Allocator + hipDeviceAttributeVirtualMemoryManagementSupported, // Device supports HIP virtual memory management + hipDeviceAttributeHostRegisterSupported, // Can device support host memory registration via hipHostRegister + hipDeviceAttributeMemoryPoolSupportedHandleTypes, // Supported handle mask for HIP Stream Ordered Memory Allocator - hipDeviceAttributeCudaCompatibleEnd = 9999, - hipDeviceAttributeAmdSpecificBegin = 10000, + hipDeviceAttributeCudaCompatibleEnd = 9999, + hipDeviceAttributeAmdSpecificBegin = 10000, - hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin, ///< Frequency in khz of the timer used by the device-side "clock*" - hipDeviceAttributeUnused3, ///< Previously hipDeviceAttributeArch - hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory PerMultiprocessor. - hipDeviceAttributeUnused4, ///< Previously hipDeviceAttributeGcnArch - hipDeviceAttributeUnused5, ///< Previously hipDeviceAttributeGcnArchName - hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register - hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register - hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple - ///< devices with unmatched functions - hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple - ///< devices with unmatched grid dimensions - hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple - ///< devices with unmatched block dimensions - hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple - ///< devices with unmatched shared memories - hipDeviceAttributeIsLargeBar, ///< Whether it is LargeBar - hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device - hipDeviceAttributeCanUseStreamWaitValue, ///< '1' if Device supports hipStreamWaitValue32() and - ///< hipStreamWaitValue64(), '0' otherwise. 
- hipDeviceAttributeImageSupport, ///< '1' if Device supports image, '0' otherwise. - hipDeviceAttributePhysicalMultiProcessorCount, ///< All available physical compute - ///< units for the device - hipDeviceAttributeFineGrainSupport, ///< '1' if Device supports fine grain, '0' otherwise - hipDeviceAttributeWallClockRate, ///< Constant frequency of wall clock in kilohertz. + hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin, // Frequency in khz of the timer used by the device-side "clock*" + hipDeviceAttributeUnused3, // Previously hipDeviceAttributeArch + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, // Maximum Shared Memory PerMultiprocessor. + hipDeviceAttributeUnused4, // Previously hipDeviceAttributeGcnArch + hipDeviceAttributeUnused5, // Previously hipDeviceAttributeGcnArchName + hipDeviceAttributeHdpMemFlushCntl, // Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeHdpRegFlushCntl, // Address of the HDP_REG_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, // Supports cooperative launch on multiple devices with unmatched functions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, // Supports cooperative launch on multiple devices with unmatched grid dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, // Supports cooperative launch on multiple devices with unmatched block dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, // Supports cooperative launch on multiple devices with unmatched shared memories + hipDeviceAttributeIsLargeBar, // Whether it is LargeBar + hipDeviceAttributeAsicRevision, // Revision of the GPU in this device + hipDeviceAttributeCanUseStreamWaitValue, // '1' if Device supports hipStreamWaitValue32() and hipStreamWaitValue64(), '0' otherwise. + hipDeviceAttributeImageSupport, // '1' if Device supports image, '0' otherwise. + hipDeviceAttributePhysicalMultiProcessorCount, // All available physical compute units for the device + hipDeviceAttributeFineGrainSupport, // '1' if Device supports fine grain, '0' otherwise + hipDeviceAttributeWallClockRate, // Constant frequency of wall clock in kilohertz. + + hipDeviceAttributeAmdSpecificEnd = 19999, + hipDeviceAttributeVendorSpecificBegin = 20000, + // Extended attributes for vendors - hipDeviceAttributeAmdSpecificEnd = 19999, - hipDeviceAttributeVendorSpecificBegin = 20000, - // Extended attributes for vendors } hipDeviceAttribute_t; /** * hipDeviceArch_t * */ -typedef struct { - // 32-bit Atomics - unsigned hasGlobalInt32Atomics : 1; ///< 32-bit integer atomics for global memory. - unsigned hasGlobalFloatAtomicExch : 1; ///< 32-bit float atomic exch for global memory. - unsigned hasSharedInt32Atomics : 1; ///< 32-bit integer atomics for shared memory. - unsigned hasSharedFloatAtomicExch : 1; ///< 32-bit float atomic exch for shared memory. - unsigned hasFloatAtomicAdd : 1; ///< 32-bit float atomic add in global and shared memory. +typedef struct +{ + // 32-bit Atomics + unsigned hasGlobalInt32Atomics : 1; // 32-bit integer atomics for global memory. + unsigned hasGlobalFloatAtomicExch : 1; // 32-bit float atomic exch for global memory. + unsigned hasSharedInt32Atomics : 1; // 32-bit integer atomics for shared memory. + unsigned hasSharedFloatAtomicExch : 1; // 32-bit float atomic exch for shared memory. + unsigned hasFloatAtomicAdd : 1; // 32-bit float atomic add in global and shared memory. 
- // 64-bit Atomics - unsigned hasGlobalInt64Atomics : 1; ///< 64-bit integer atomics for global memory. - unsigned hasSharedInt64Atomics : 1; ///< 64-bit integer atomics for shared memory. + // 64-bit Atomics + unsigned hasGlobalInt64Atomics : 1; // 64-bit integer atomics for global memory. + unsigned hasSharedInt64Atomics : 1; // 64-bit integer atomics for shared memory. - // Doubles - unsigned hasDoubles : 1; ///< Double-precision floating point. + // Doubles + unsigned hasDoubles : 1; // Double-precision floating point. - // Warp cross-lane operations - unsigned hasWarpVote : 1; ///< Warp vote instructions (__any, __all). - unsigned hasWarpBallot : 1; ///< Warp ballot instructions (__ballot). - unsigned hasWarpShuffle : 1; ///< Warp shuffle operations. (__shfl_*). - unsigned hasFunnelShift : 1; ///< Funnel two words into one with shift&mask caps. + // Warp cross-lane operations + unsigned hasWarpVote : 1; // Warp vote instructions (__any, __all). + unsigned hasWarpBallot : 1; // Warp ballot instructions (__ballot). + unsigned hasWarpShuffle : 1; // Warp shuffle operations. (__shfl_*). + unsigned hasFunnelShift : 1; // Funnel two words into one with shift&mask caps. - // Sync - unsigned hasThreadFenceSystem : 1; ///< __threadfence_system. - unsigned hasSyncThreadsExt : 1; ///< __syncthreads_count, syncthreads_and, syncthreads_or. + // Sync + unsigned hasThreadFenceSystem : 1; // __threadfence_system. + unsigned hasSyncThreadsExt : 1; // __syncthreads_count, syncthreads_and, syncthreads_or. + + // Misc + unsigned hasSurfaceFuncs : 1; // Surface functions. + unsigned has3dGrid : 1; // Grid and group dims are 3D (rather than 2D). + unsigned hasDynamicParallelism : 1; // Dynamic parallelism. - // Misc - unsigned hasSurfaceFuncs : 1; ///< Surface functions. - unsigned has3dGrid : 1; ///< Grid and group dims are 3D (rather than 2D). - unsigned hasDynamicParallelism : 1; ///< Dynamic parallelism. } hipDeviceArch_t; -typedef struct hipUUID_t { - char bytes[16]; +typedef struct hipUUID_t +{ + char bytes[16]; + } hipUUID; @@ -349,149 +333,127 @@ typedef struct hipUUID_t { * hipDeviceProp * */ -typedef struct hipDeviceProp_t { - char name[256]; ///< Device name. - hipUUID uuid; ///< UUID of a device - char luid[8]; ///< 8-byte unique identifier. Only valid on windows - unsigned int luidDeviceNodeMask; ///< LUID node mask - size_t totalGlobalMem; ///< Size of global memory region (in bytes). - size_t sharedMemPerBlock; ///< Size of shared memory per block (in bytes). - int regsPerBlock; ///< Registers per block. - int warpSize; ///< Warp size. - size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies - ///< pitched memory - int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. - int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. - int maxGridSize[3]; ///< Max grid dimensions (XYZ). - int clockRate; ///< Max clock frequency of the multiProcessors in khz. - size_t totalConstMem; ///< Size of shared constant memory region on the device - ///< (in bytes). - int major; ///< Major compute capability. On HCC, this is an approximation and features may - ///< differ from CUDA CC. See the arch feature flags for portable ways to query - ///< feature caps. - int minor; ///< Minor compute capability. On HCC, this is an approximation and features may - ///< differ from CUDA CC. See the arch feature flags for portable ways to query - ///< feature caps. 
- size_t textureAlignment; ///< Alignment requirement for textures - size_t texturePitchAlignment; ///< Pitch alignment requirement for texture references bound to - int deviceOverlap; ///< Deprecated. Use asyncEngineCount instead - int multiProcessorCount; ///< Number of multi-processors (compute units). - int kernelExecTimeoutEnabled; ///< Run time limit for kernels executed on the device - int integrated; ///< APU vs dGPU - int canMapHostMemory; ///< Check whether HIP can map host memory - int computeMode; ///< Compute mode. - int maxTexture1D; ///< Maximum number of elements in 1D images - int maxTexture1DMipmap; ///< Maximum 1D mipmap texture size - int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory - int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements - int maxTexture2DMipmap[2]; ///< Maximum number of elements in 2D array mipmap of images - int maxTexture2DLinear[3]; ///< Maximum 2D tex dimensions if tex are bound to pitched memory - int maxTexture2DGather[2]; ///< Maximum 2D tex dimensions if gather has to be performed - int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image - ///< elements - int maxTexture3DAlt[3]; ///< Maximum alternate 3D texture dims - int maxTextureCubemap; ///< Maximum cubemap texture dims - int maxTexture1DLayered[2]; ///< Maximum number of elements in 1D array images - int maxTexture2DLayered[3]; ///< Maximum number of elements in 2D array images - int maxTextureCubemapLayered[2]; ///< Maximum cubemaps layered texture dims - int maxSurface1D; ///< Maximum 1D surface size - int maxSurface2D[2]; ///< Maximum 2D surface size - int maxSurface3D[3]; ///< Maximum 3D surface size - int maxSurface1DLayered[2]; ///< Maximum 1D layered surface size - int maxSurface2DLayered[3]; ///< Maximum 2D layared surface size - int maxSurfaceCubemap; ///< Maximum cubemap surface size - int maxSurfaceCubemapLayered[2]; ///< Maximum cubemap layered surface size - size_t surfaceAlignment; ///< Alignment requirement for surface - int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. - int ECCEnabled; ///< Device has ECC support enabled - int pciBusID; ///< PCI Bus ID. - int pciDeviceID; ///< PCI Device ID. - int pciDomainID; ///< PCI Domain ID - int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0 - int asyncEngineCount; ///< Number of async engines - int unifiedAddressing; ///< Does device and host share unified address space - int memoryClockRate; ///< Max global memory clock frequency in khz. - int memoryBusWidth; ///< Global memory bus width in bits. - int l2CacheSize; ///< L2 cache size. - int persistingL2CacheMaxSize; ///< Device's max L2 persisting lines in bytes - int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. - int streamPrioritiesSupported; ///< Device supports stream priority - int globalL1CacheSupported; ///< Indicates globals are cached in L1 - int localL1CacheSupported; ///< Locals are cahced in L1 - size_t sharedMemPerMultiprocessor; ///< Amount of shared memory available per multiprocessor. - int regsPerMultiprocessor; ///< registers available per multiprocessor - int managedMemory; ///< Device supports allocating managed memory on this system - int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. 
- int multiGpuBoardGroupID; ///< Unique identifier for a group of devices on same multiboard GPU - int hostNativeAtomicSupported; ///< Link between host and device supports native atomics - int singleToDoublePrecisionPerfRatio; ///< Deprecated. CUDA only. - int pageableMemoryAccess; ///< Device supports coherently accessing pageable memory - ///< without calling hipHostRegister on it - int concurrentManagedAccess; ///< Device can coherently access managed memory concurrently with - ///< the CPU - int computePreemptionSupported; ///< Is compute preemption supported on the device - int canUseHostPointerForRegisteredMem; ///< Device can access host registered memory with same - ///< address as the host - int cooperativeLaunch; ///< HIP device supports cooperative launch - int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple - ///< devices - size_t - sharedMemPerBlockOptin; ///< Per device m ax shared mem per block usable by special opt in - int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's - ///< page tables - int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device - ///< without migration - int maxBlocksPerMultiProcessor; ///< Max number of blocks on CU - int accessPolicyMaxWindowSize; ///< Max value of access policy window - size_t reservedSharedMemPerBlock; ///< Shared memory reserved by driver per block - int hostRegisterSupported; ///< Device supports hipHostRegister - int sparseHipArraySupported; ///< Indicates if device supports sparse hip arrays - int hostRegisterReadOnlySupported; ///< Device supports using the hipHostRegisterReadOnly flag - ///< with hipHostRegistger - int timelineSemaphoreInteropSupported; ///< Indicates external timeline semaphore support - int memoryPoolsSupported; ///< Indicates if device supports hipMallocAsync and hipMemPool APIs - int gpuDirectRDMASupported; ///< Indicates device support of RDMA APIs - unsigned int gpuDirectRDMAFlushWritesOptions; ///< Bitmask to be interpreted according to - ///< hipFlushGPUDirectRDMAWritesOptions - int gpuDirectRDMAWritesOrdering; ///< value of hipGPUDirectRDMAWritesOrdering - unsigned int - memoryPoolSupportedHandleTypes; ///< Bitmask of handle types support with mempool based IPC - int deferredMappingHipArraySupported; ///< Device supports deferred mapping HIP arrays and HIP - ///< mipmapped arrays - int ipcEventSupported; ///< Device supports IPC events - int clusterLaunch; ///< Device supports cluster launch - int unifiedFunctionPointers; ///< Indicates device supports unified function pointers - int reserved[63]; ///< CUDA Reserved. +typedef struct hipDeviceProp_t +{ + char name[256]; // Device name. + hipUUID uuid; // UUID of a device + char luid[8]; // 8-byte unique identifier. Only valid on windows + unsigned int luidDeviceNodeMask; // LUID node mask + size_t totalGlobalMem; // Size of global memory region (in bytes). + size_t sharedMemPerBlock; // Size of shared memory per block (in bytes). + int regsPerBlock; // Registers per block. + int warpSize; // Warp size. + size_t memPitch; // Maximum pitch in bytes allowed by memory copies pitched memory + int maxThreadsPerBlock; // Max work items per work group or workgroup max size. + int maxThreadsDim[3]; // Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; // Max grid dimensions (XYZ). + int clockRate; // Max clock frequency of the multiProcessors in khz. 
+ size_t totalConstMem; // Size of shared constant memory region on the device (in bytes).
+ int major; // Major compute capability. On HCC, this is an approximation and features may
+ // differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
+ int minor; // Minor compute capability. On HCC, this is an approximation and features may
+ // differ from CUDA CC. See the arch feature flags for portable ways to query feature caps.
+ size_t textureAlignment; // Alignment requirement for textures
+ size_t texturePitchAlignment; // Pitch alignment requirement for texture references bound to
+ int deviceOverlap; // Deprecated. Use asyncEngineCount instead
+ int multiProcessorCount; // Number of multi-processors (compute units).
+ int kernelExecTimeoutEnabled; // Run time limit for kernels executed on the device
+ int integrated; // APU vs dGPU
+ int canMapHostMemory; // Check whether HIP can map host memory
+ int computeMode; // Compute mode.
+ int maxTexture1D; // Maximum number of elements in 1D images
+ int maxTexture1DMipmap; // Maximum 1D mipmap texture size
+ int maxTexture1DLinear; // Maximum size for 1D textures bound to linear memory
+ int maxTexture2D[2]; // Maximum dimensions (width, height) of 2D images, in image elements
+ int maxTexture2DMipmap[2]; // Maximum number of elements in 2D array mipmap of images
+ int maxTexture2DLinear[3]; // Maximum 2D tex dimensions if tex are bound to pitched memory
+ int maxTexture2DGather[2]; // Maximum 2D tex dimensions if gather has to be performed
+ int maxTexture3D[3]; // Maximum dimensions (width, height, depth) of 3D images, in image elements
+ int maxTexture3DAlt[3]; // Maximum alternate 3D texture dims
+ int maxTextureCubemap; // Maximum cubemap texture dims
+ int maxTexture1DLayered[2]; // Maximum number of elements in 1D array images
+ int maxTexture2DLayered[3]; // Maximum number of elements in 2D array images
+ int maxTextureCubemapLayered[2]; // Maximum cubemaps layered texture dims
+ int maxSurface1D; // Maximum 1D surface size
+ int maxSurface2D[2]; // Maximum 2D surface size
+ int maxSurface3D[3]; // Maximum 3D surface size
+ int maxSurface1DLayered[2]; // Maximum 1D layered surface size
+ int maxSurface2DLayered[3]; // Maximum 2D layered surface size
+ int maxSurfaceCubemap; // Maximum cubemap surface size
+ int maxSurfaceCubemapLayered[2]; // Maximum cubemap layered surface size
+ size_t surfaceAlignment; // Alignment requirement for surface
+ int concurrentKernels; // Device can possibly execute multiple kernels concurrently.
+ int ECCEnabled; // Device has ECC support enabled
+ int pciBusID; // PCI Bus ID.
+ int pciDeviceID; // PCI Device ID.
+ int pciDomainID; // PCI Domain ID
+ int tccDriver; // 1:If device is Tesla device using TCC driver, else 0
+ int asyncEngineCount; // Number of async engines
+ int unifiedAddressing; // Does device and host share unified address space
+ int memoryClockRate; // Max global memory clock frequency in khz.
+ int memoryBusWidth; // Global memory bus width in bits.
+ int l2CacheSize; // L2 cache size.
+ int persistingL2CacheMaxSize; // Device's max L2 persisting lines in bytes
+ int maxThreadsPerMultiProcessor; // Maximum resident threads per multi-processor.
+ int streamPrioritiesSupported; // Device supports stream priority
+ int globalL1CacheSupported; // Indicates globals are cached in L1
+ int localL1CacheSupported; // Locals are cached in L1
+ size_t sharedMemPerMultiprocessor; // Amount of shared memory available per multiprocessor.
+ int regsPerMultiprocessor; // registers available per multiprocessor
+ int managedMemory; // Device supports allocating managed memory on this system
+ int isMultiGpuBoard; // 1 if device is on a multi-GPU board, 0 if not.
+ int multiGpuBoardGroupID; // Unique identifier for a group of devices on same multiboard GPU
+ int hostNativeAtomicSupported; // Link between host and device supports native atomics
+ int singleToDoublePrecisionPerfRatio; // Deprecated. CUDA only.
+ int pageableMemoryAccess; // Device supports coherently accessing pageable memory
+ // without calling hipHostRegister on it
+ int concurrentManagedAccess; // Device can coherently access managed memory concurrently with the CPU
+ int computePreemptionSupported; // Is compute preemption supported on the device
+ int canUseHostPointerForRegisteredMem; // Device can access host registered memory with same address as the host
+ int cooperativeLaunch; // HIP device supports cooperative launch
+ int cooperativeMultiDeviceLaunch; // HIP device supports cooperative launch on multiple devices
+ size_t sharedMemPerBlockOptin; // Per device max shared mem per block usable by special opt in
+ int pageableMemoryAccessUsesHostPageTables; // Device accesses pageable memory via the host's page tables
+ int directManagedMemAccessFromHost; // Host can directly access managed memory on the device without migration
+ int maxBlocksPerMultiProcessor; // Max number of blocks on CU
+ int accessPolicyMaxWindowSize; // Max value of access policy window
+ size_t reservedSharedMemPerBlock; // Shared memory reserved by driver per block
+ int hostRegisterSupported; // Device supports hipHostRegister
+ int sparseHipArraySupported; // Indicates if device supports sparse hip arrays
+ int hostRegisterReadOnlySupported; // Device supports using the hipHostRegisterReadOnly flag with hipHostRegister
+ int timelineSemaphoreInteropSupported; // Indicates external timeline semaphore support
+ int memoryPoolsSupported; // Indicates if device supports hipMallocAsync and hipMemPool APIs
+ int gpuDirectRDMASupported; // Indicates device support of RDMA APIs
+ unsigned int gpuDirectRDMAFlushWritesOptions; // Bitmask to be interpreted according to hipFlushGPUDirectRDMAWritesOptions
+ int gpuDirectRDMAWritesOrdering; // value of hipGPUDirectRDMAWritesOrdering
+ unsigned int memoryPoolSupportedHandleTypes; // Bitmask of handle types supported with mempool based IPC
+ int deferredMappingHipArraySupported; // Device supports deferred mapping HIP arrays and HIP mipmapped arrays
+ int ipcEventSupported; // Device supports IPC events
+ int clusterLaunch; // Device supports cluster launch
+ int unifiedFunctionPointers; // Indicates device supports unified function pointers
+ int reserved[63]; // CUDA Reserved.

- int hipReserved[32]; ///< Reserved for adding new entries for HIP/CUDA.
+ int hipReserved[32]; // Reserved for adding new entries for HIP/CUDA.
+
+ /* HIP Only struct members */
+ char gcnArchName[256]; // AMD GCN Arch Name. HIP Only.
+ size_t maxSharedMemoryPerMultiProcessor; // Maximum Shared Memory Per CU. HIP Only.
+ int clockInstructionRate; // Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP.
+ hipDeviceArch_t arch; // Architectural feature flags. New for HIP.
+ unsigned int* hdpMemFlushCntl; // Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+ unsigned int* hdpRegFlushCntl; // Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+ int cooperativeMultiDeviceUnmatchedFunc; // HIP device supports cooperative launch on multiple devices with unmatched functions
+ int cooperativeMultiDeviceUnmatchedGridDim; // HIP device supports cooperative launch on multiple devices with unmatched grid dimensions
+ int cooperativeMultiDeviceUnmatchedBlockDim; // HIP device supports cooperative launch on multiple devices with unmatched block dimensions
+ int cooperativeMultiDeviceUnmatchedSharedMem; // HIP device supports cooperative launch on multiple devices with unmatched shared memories
+ int isLargeBar; // 1: if it is a large PCI bar device, else 0
+ int asicRevision; // Revision of the GPU in this device

- /* HIP Only struct members */
- char gcnArchName[256]; ///< AMD GCN Arch Name. HIP Only.
- size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per CU. HIP Only.
- int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*"
- ///< instructions. New for HIP.
- hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP.
- unsigned int* hdpMemFlushCntl; ///< Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register
- unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register
- int cooperativeMultiDeviceUnmatchedFunc; ///< HIP device supports cooperative launch on
- ///< multiple
- /// devices with unmatched functions
- int cooperativeMultiDeviceUnmatchedGridDim; ///< HIP device supports cooperative launch on
- ///< multiple
- /// devices with unmatched grid dimensions
- int cooperativeMultiDeviceUnmatchedBlockDim; ///< HIP device supports cooperative launch on
- ///< multiple
- /// devices with unmatched block dimensions
- int cooperativeMultiDeviceUnmatchedSharedMem; ///< HIP device supports cooperative launch on
- ///< multiple
- /// devices with unmatched shared memories
- int isLargeBar; ///< 1: if it is a large PCI bar device, else 0
- int asicRevision; ///< Revision of the GPU in this device
 } hipDeviceProp_t;
 //Flags that can be used with hipStreamCreateWithFlags.
 /** Default stream creation flags. These are used with hipStreamCreate().*/
-#define hipStreamDefault 0x00
+#define hipStreamDefault 0x00
 /** Stream does not implicitly synchronize with null stream.*/
 #define hipStreamNonBlocking 0x01
@@ -511,46 +473,48 @@
 /** Use a device-scope release when recording this event. This flag is useful to obtain more
 * precise timings of commands between events. The flag is a no-op on CUDA platforms.*/
-#define hipEventReleaseToDevice 0x40000000
+#define hipEventReleaseToDevice 0x40000000
 /** Use a system-scope release when recording this event. This flag is useful to make
 * non-coherent host memory visible to the host. The flag is a no-op on CUDA platforms.*/
-#define hipEventReleaseToSystem 0x80000000
+#define hipEventReleaseToSystem 0x80000000
 /** Automatically select between Spin and Yield.*/
 #define hipDeviceScheduleAuto 0x0
 /** Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may
 * consume more power.*/
-#define hipDeviceScheduleSpin 0x1
+#define hipDeviceScheduleSpin 0x1
 /** Yield the CPU to the operating system when waiting.
May increase latency, but lowers power * and is friendlier to other threads in the system.*/ -#define hipDeviceScheduleYield 0x2 +#define hipDeviceScheduleYield 0x2 #define hipDeviceScheduleBlockingSync 0x4 #define hipDeviceScheduleMask 0x7 #define hipDeviceMapHost 0x8 #define hipDeviceLmemResizeToMax 0x16 -typedef enum hipJitOption { - hipJitOptionMaxRegisters = 0, - hipJitOptionThreadsPerBlock, - hipJitOptionWallTime, - hipJitOptionInfoLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionOptimizationLevel, - hipJitOptionTargetFromContext, - hipJitOptionTarget, - hipJitOptionFallbackStrategy, - hipJitOptionGenerateDebugInfo, - hipJitOptionLogVerbose, - hipJitOptionGenerateLineInfo, - hipJitOptionCacheMode, - hipJitOptionSm3xOpt, - hipJitOptionFastCompile, - hipJitOptionNumOptions +typedef enum hipJitOption +{ + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionSm3xOpt, + hipJitOptionFastCompile, + hipJitOptionNumOptions + } hipJitOption; // stop: hip_runtime_api.h @@ -563,11 +527,17 @@ typedef enum hipJitOption { #define HIP_API_CALL HIPAPI +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXCREATE) (hipCtx_t *, unsigned int, hipDevice_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXDESTROY) (hipCtx_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXPOPCURRENT) (hipCtx_t *); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXPUSHCURRENT) (hipCtx_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXSETCURRENT) (hipCtx_t); +// deprecated typedef hipError_t (HIP_API_CALL *HIP_HIPCTXSYNCHRONIZE) (void); typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETATTRIBUTE) (int *, hipDeviceAttribute_t, hipDevice_t); typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETCOUNT) (int *); @@ -575,7 +545,8 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGET) (hipDevice_t *, typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETNAME) (char *, int, hipDevice_t); typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICETOTALMEM) (size_t *, hipDevice_t); typedef hipError_t (HIP_API_CALL *HIP_HIPDRIVERGETVERSION) (int *); -typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATE) (hipEvent_t *, unsigned int); +typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATE) (hipEvent_t *); +typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATEWITHFLAGS) (hipEvent_t *, unsigned int); typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTDESTROY) (hipEvent_t); typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTELAPSEDTIME) (float *, hipEvent_t, hipEvent_t); typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTRECORD) (hipEvent_t, hipStream_t); @@ -603,21 +574,30 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEGETGLOBAL) (hipDeviceptr_t typedef hipError_t (HIP_API_CALL *HIP_HIPMODULELOADDATAEX) (hipModule_t *, const void *, unsigned int, hipJitOption *, void **); typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEUNLOAD) (hipModule_t); typedef hipError_t (HIP_API_CALL *HIP_HIPRUNTIMEGETVERSION) (int *); -typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE) (hipStream_t *, unsigned int); +typedef hipError_t (HIP_API_CALL 
*HIP_HIPSETDEVICE) (hipDevice_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPSETDEVICEFLAGS) (unsigned int); +typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE) (hipStream_t *); +typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATEWITHFLAGS) (hipStream_t *, unsigned int); typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMDESTROY) (hipStream_t); typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMSYNCHRONIZE) (hipStream_t); typedef hipError_t (HIP_API_CALL *HIP_HIPGETDEVICEPROPERTIES) (hipDeviceProp_t *, hipDevice_t); -typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) (int *, hipFunction_t, int, size_t); +typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) (int *, hipFunction_t, int, size_t); typedef struct hc_hip_lib { hc_dynlib_t lib; + // deprecated HIP_HIPCTXCREATE hipCtxCreate; + // deprecated HIP_HIPCTXDESTROY hipCtxDestroy; + // deprecated HIP_HIPCTXPOPCURRENT hipCtxPopCurrent; + // deprecated HIP_HIPCTXPUSHCURRENT hipCtxPushCurrent; + // deprecated HIP_HIPCTXSETCURRENT hipCtxSetCurrent; + // deprecated HIP_HIPCTXSYNCHRONIZE hipCtxSynchronize; HIP_HIPDEVICEGETATTRIBUTE hipDeviceGetAttribute; HIP_HIPDEVICEGETCOUNT hipDeviceGetCount; @@ -626,6 +606,7 @@ typedef struct hc_hip_lib HIP_HIPDEVICETOTALMEM hipDeviceTotalMem; HIP_HIPDRIVERGETVERSION hipDriverGetVersion; HIP_HIPEVENTCREATE hipEventCreate; + HIP_HIPEVENTCREATEWITHFLAGS hipEventCreateWithFlags; HIP_HIPEVENTDESTROY hipEventDestroy; HIP_HIPEVENTELAPSEDTIME hipEventElapsedTime; HIP_HIPEVENTRECORD hipEventRecord; @@ -653,62 +634,75 @@ typedef struct hc_hip_lib HIP_HIPMODULELOADDATAEX hipModuleLoadDataEx; HIP_HIPMODULEUNLOAD hipModuleUnload; HIP_HIPRUNTIMEGETVERSION hipRuntimeGetVersion; + HIP_HIPSETDEVICE hipSetDevice; + HIP_HIPSETDEVICEFLAGS hipSetDeviceFlags; HIP_HIPSTREAMCREATE hipStreamCreate; + HIP_HIPSTREAMCREATEWITHFLAGS hipStreamCreateWithFlags; HIP_HIPSTREAMDESTROY hipStreamDestroy; HIP_HIPSTREAMSYNCHRONIZE hipStreamSynchronize; HIP_HIPGETDEVICEPROPERTIES hipGetDeviceProperties; - HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipModuleOccupancyMaxActiveBlocksPerMultiprocessor; + HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipModuleOccupancyMaxActiveBlocksPerMultiprocessor; } hc_hip_lib_t; typedef hc_hip_lib_t HIP_PTR; -int hip_init (void *hashcat_ctx); -void hip_close (void *hashcat_ctx); +int hip_init (void *hashcat_ctx); +void hip_close (void *hashcat_ctx); -int hc_hipCtxCreate (void *hashcat_ctx, hipCtx_t *pctx, unsigned int flags, hipDevice_t dev); -int hc_hipCtxDestroy (void *hashcat_ctx, hipCtx_t ctx); -int hc_hipCtxPopCurrent (void *hashcat_ctx, hipCtx_t *pctx); -int hc_hipCtxPushCurrent (void *hashcat_ctx, hipCtx_t ctx); -int hc_hipCtxSetCurrent (void *hashcat_ctx, hipCtx_t ctx); -int hc_hipCtxSynchronize (void *hashcat_ctx); -int hc_hipDeviceGet (void *hashcat_ctx, hipDevice_t *device, int ordinal); -int hc_hipDeviceGetAttribute (void *hashcat_ctx, int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev); -int hc_hipDeviceGetCount (void *hashcat_ctx, int *count); -int hc_hipDeviceGetName (void *hashcat_ctx, char *name, int len, hipDevice_t dev); -int hc_hipDeviceTotalMem (void *hashcat_ctx, size_t *bytes, hipDevice_t dev); -int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion); -int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags); -int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent); -int hc_hipEventElapsedTime (void *hashcat_ctx, float *pMilliseconds, hipEvent_t hStart, 
hipEvent_t hEnd); -int hc_hipEventQuery (void *hashcat_ctx, hipEvent_t hEvent); -int hc_hipEventRecord (void *hashcat_ctx, hipEvent_t hEvent, hipStream_t hStream); -int hc_hipEventSynchronize (void *hashcat_ctx, hipEvent_t hEvent); -int hc_hipFuncGetAttribute (void *hashcat_ctx, int *pi, hipFunction_attribute attrib, hipFunction_t hfunc); -int hc_hipInit (void *hashcat_ctx, unsigned int Flags); -int hc_hipLaunchKernel (void *hashcat_ctx, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra); -int hc_hipMemAlloc (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize); -int hc_hipMemFree (void *hashcat_ctx, hipDeviceptr_t dptr); -int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total); -int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount); -int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount); -int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount); -int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N); -int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N); -int hc_hipMemcpyDtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); -int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); -int hc_hipMemcpyHtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream); -int hc_hipMemsetD32Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream); -int hc_hipMemsetD8Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream); -int hc_hipModuleGetFunction (void *hashcat_ctx, hipFunction_t *hfunc, hipModule_t hmod, const char *name); -int hc_hipModuleGetGlobal (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name); -int hc_hipModuleLoadDataEx (void *hashcat_ctx, hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues); -int hc_hipModuleUnload (void *hashcat_ctx, hipModule_t hmod); -int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion); -int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags); -int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream); -int hc_hipStreamSynchronize (void *hashcat_ctx, hipStream_t hStream); -int hc_hipGetDeviceProperties (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev); + // deprecated +int hc_hipCtxCreate (void *hashcat_ctx, hipCtx_t *pctx, unsigned int flags, hipDevice_t dev); + // deprecated +int hc_hipCtxDestroy (void *hashcat_ctx, hipCtx_t ctx); + // deprecated +int hc_hipCtxPopCurrent (void *hashcat_ctx, hipCtx_t *pctx); + // deprecated +int hc_hipCtxPushCurrent (void *hashcat_ctx, hipCtx_t ctx); + // deprecated +int hc_hipCtxSetCurrent (void *hashcat_ctx, hipCtx_t ctx); + // deprecated +int hc_hipCtxSynchronize (void *hashcat_ctx); +int hc_hipDeviceGet (void *hashcat_ctx, hipDevice_t *device, int ordinal); +int hc_hipDeviceGetAttribute (void *hashcat_ctx, int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev); 
+int hc_hipDeviceGetCount (void *hashcat_ctx, int *count); +int hc_hipDeviceGetName (void *hashcat_ctx, char *name, int len, hipDevice_t dev); +int hc_hipDeviceTotalMem (void *hashcat_ctx, size_t *bytes, hipDevice_t dev); +int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion); +int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent); +int hc_hipEventCreateWithFlags (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags); +int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent); +int hc_hipEventElapsedTime (void *hashcat_ctx, float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd); +int hc_hipEventQuery (void *hashcat_ctx, hipEvent_t hEvent); +int hc_hipEventRecord (void *hashcat_ctx, hipEvent_t hEvent, hipStream_t hStream); +int hc_hipEventSynchronize (void *hashcat_ctx, hipEvent_t hEvent); +int hc_hipFuncGetAttribute (void *hashcat_ctx, int *pi, hipFunction_attribute attrib, hipFunction_t hfunc); +int hc_hipInit (void *hashcat_ctx, unsigned int Flags); +int hc_hipLaunchKernel (void *hashcat_ctx, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra); +int hc_hipMemAlloc (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize); +int hc_hipMemFree (void *hashcat_ctx, hipDeviceptr_t dptr); +int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total); +int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount); +int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount); +int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount); +int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N); +int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N); +int hc_hipMemcpyDtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); +int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream); +int hc_hipMemcpyHtoDAsync (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream); +int hc_hipMemsetD32Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream); +int hc_hipMemsetD8Async (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream); +int hc_hipModuleGetFunction (void *hashcat_ctx, hipFunction_t *hfunc, hipModule_t hmod, const char *name); +int hc_hipModuleGetGlobal (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name); +int hc_hipModuleLoadDataEx (void *hashcat_ctx, hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues); +int hc_hipModuleUnload (void *hashcat_ctx, hipModule_t hmod); +int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion); +int hc_hipSetDevice (void *hashcat_ctx, hipDevice_t dev); +int hc_hipSetDeviceFlags (void *hashcat_ctx, unsigned int flags); +int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream); +int hc_hipStreamCreateWithFlags (void *hashcat_ctx, hipStream_t *phStream, unsigned int flags); +int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream); +int 
hc_hipStreamSynchronize (void *hashcat_ctx, hipStream_t hStream); +int hc_hipGetDeviceProperties (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev); int hc_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor (void *hashcat_ctx, int *numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk); #endif // HC_EXT_HIP_H diff --git a/src/autotune.c b/src/autotune.c index 7938fb259..e8f704599 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -679,7 +679,7 @@ HC_API_CALL void *thread_autotune (void *p) if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL; } // check for autotune failure @@ -695,11 +695,6 @@ HC_API_CALL void *thread_autotune (void *p) if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL; - } - return NULL; } diff --git a/src/backend.c b/src/backend.c index 6e8fe4aad..6e84eed34 100644 --- a/src/backend.c +++ b/src/backend.c @@ -993,7 +993,7 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1; if (hc_hipMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1; @@ -1059,11 +1059,6 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return -1; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1; - } - return 0; } @@ -1082,13 +1077,11 @@ int copy_pws_idx (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1; if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t))) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; - - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1; } #if defined (__APPLE__) @@ -1122,13 +1115,11 @@ int copy_pws_comp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1; if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1; if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1; - - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1; } #if defined (__APPLE__) @@ -5937,24 +5928,24 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virt device_param->has_prmt = (sm >= 20) ? true : false; device_param->has_shfw = (sm >= 70) ? 
true : false; + // one-time init cuda context + + if (hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) + { + device_param->skipped = true; + + continue; + } + + if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) + { + device_param->skipped = true; + + continue; + } + // device_available_mem - CUcontext cuda_context; - - if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_cuCtxPushCurrent (hashcat_ctx, cuda_context) == -1) - { - device_param->skipped = true; - - continue; - } - size_t free = 0; size_t total = 0; @@ -5967,14 +5958,7 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virt device_param->device_available_mem = ((u64) free * (100 - user_options->backend_devices_keepfree)) / 100; - if (hc_cuCtxPopCurrent (hashcat_ctx, &cuda_context) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1) + if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) { device_param->skipped = true; @@ -6440,24 +6424,24 @@ static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virth device_param->has_prmt = false; device_param->has_shfw = true; // always reports false : prop.arch.hasFunnelShift; + // one-time init hip context + + if (hc_hipSetDeviceFlags (hashcat_ctx, hipDeviceScheduleBlockingSync) == -1) + { + device_param->skipped = true; + + continue; + } + + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) + { + device_param->skipped = true; + + continue; + } + // device_available_mem - hipCtx_t hip_context; - - if (hc_hipCtxCreate (hashcat_ctx, &hip_context, hipDeviceScheduleBlockingSync, device_param->hip_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_hipCtxPushCurrent (hashcat_ctx, hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - size_t free = 0; size_t total = 0; @@ -6470,20 +6454,6 @@ static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virth device_param->device_available_mem = ((u64) free * (100 - user_options->backend_devices_keepfree)) / 100; - if (hc_hipCtxPopCurrent (hashcat_ctx, &hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_hipCtxDestroy (hashcat_ctx, hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - #if defined (__linux__) if (strchr (folder_config->cpath_real, ' ') != NULL) { @@ -8623,13 +8593,9 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == true) continue; } - /** - * create command-queue - */ + // one-time init metal command-queue - mtl_command_queue command_queue; - - if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &command_queue) == -1) + if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &device_param->metal_command_queue) == -1) { device_param->skipped = true; @@ -8678,11 +8644,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) u8 tmp_host[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }; - if (hc_mtlMemcpyHtoD (hashcat_ctx, command_queue, tmp_device[c], 0, tmp_host, sizeof (tmp_host)) == -1) break; - if (hc_mtlMemcpyDtoH (hashcat_ctx, command_queue, tmp_host, tmp_device[c], 0, sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyHtoD (hashcat_ctx, 
device_param->metal_command_queue, tmp_device[c], 0, tmp_host, sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, tmp_host, tmp_device[c], 0, sizeof (tmp_host)) == -1) break; - if (hc_mtlMemcpyHtoD (hashcat_ctx, command_queue, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), tmp_host, sizeof (tmp_host)) == -1) break; - if (hc_mtlMemcpyDtoH (hashcat_ctx, command_queue, tmp_host, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyHtoD (hashcat_ctx, device_param->metal_command_queue, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), tmp_host, sizeof (tmp_host)) == -1) break; + if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, tmp_host, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host)) == -1) break; } device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; @@ -8707,8 +8673,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) hcfree (tmp_device); } - hc_mtlReleaseCommandQueue (hashcat_ctx, command_queue); - if (device_param->device_host_unified_memory == 1) { // so, we actually have only half the memory because we need the same buffers on host side @@ -8734,11 +8698,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (device_param->skipped == true) continue; } - /** - * create context for each device - */ - - cl_context context; + // one-time init opencl context /* cl_context_properties properties[3]; @@ -8747,10 +8707,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) properties[1] = (cl_context_properties) device_param->opencl_platform; properties[2] = 0; - CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context); + CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context); */ - if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1) + if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1) { device_param->skipped = true; @@ -8760,13 +8720,9 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) continue; } - /** - * create command-queue - */ + // one-time init open command-queue - cl_command_queue command_queue; - - if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1) + if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1) { device_param->skipped = true; @@ -8781,17 +8737,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD)) { #define RUN_INSTRUCTION_CHECKS() \ - device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vadd_co = opencl_test_instruction 
(hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ - device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ - device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vadd = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vadd_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vsub = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vsub_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32 %0, vcc, 0, 0;\" : \"=v\"(r1)); }"); \ + 
device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \ + device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ + device_param->has_vperm = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r1)); }"); \ if (backend_devices_idx > 0) { @@ -8979,7 +8935,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl; - tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); + tmp_device[c] = ocl->clCreateBuffer (device_param->opencl_context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err); if (CL_err != CL_SUCCESS) { @@ -8992,11 +8948,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) u8 tmp_host[8]; - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueReadBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueWriteBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueReadBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; - if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueReadBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; + if (ocl->clEnqueueWriteBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break; } device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE; @@ -9008,24 +8964,28 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) // clean up + int r = 0; + for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++) { if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break; if (tmp_device[c] != NULL) { - if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1; + if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) r = -1; } } hcfree (tmp_device); + + if (r == -1) + { + // return -1 here is blocking, to be better evaluated + //return -1; + } } } - hc_clReleaseCommandQueue (hashcat_ctx, command_queue); - - 
hc_clReleaseContext (hashcat_ctx, context); - if (device_param->device_host_unified_memory == 1) { // so, we actually have only half the memory because we need the same buffers on host side @@ -9065,23 +9025,61 @@ void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx) hcfree (backend_ctx->opencl_platforms_version[opencl_platforms_idx]); } + // one-time release context/command-queue from all runtimes + for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++) { hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx]; hcfree (device_param->device_name); + if (device_param->is_cuda == true) + { + if (device_param->cuda_context) + { + hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context); + + device_param->cuda_context = NULL; + } + } + + if (device_param->is_hip == true) + { + hcfree (device_param->gcnArchName); + } + + #if defined (__APPLE__) + if (device_param->is_metal == true) + { + if (device_param->metal_command_queue) + { + hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue); + + device_param->metal_command_queue = NULL; + } + } + #endif + if (device_param->is_opencl == true) { hcfree (device_param->opencl_driver_version); hcfree (device_param->opencl_device_version); hcfree (device_param->opencl_device_c_version); hcfree (device_param->opencl_device_vendor); - } - if (device_param->is_hip == true) - { - hcfree (device_param->gcnArchName); + if (device_param->opencl_command_queue) + { + hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue); + + device_param->opencl_command_queue = NULL; + } + + if (device_param->opencl_context) + { + hc_clReleaseContext (hashcat_ctx, device_param->opencl_context); + + device_param->opencl_context = NULL; + } } } @@ -10730,93 +10728,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } #endif - /** - * create context for each device - */ - - if (device_param->is_cuda == true) - { - if (hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) - { - device_param->skipped = true; - - continue; - } - } - - if (device_param->is_hip == true) - { - if (hc_hipCtxCreate (hashcat_ctx, &device_param->hip_context, hipDeviceScheduleBlockingSync, device_param->hip_device) == -1) - { - device_param->skipped = true; - - continue; - } - - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - } - - #if defined (__APPLE__) - if (device_param->is_metal == true) - { - /** - * create command-queue - */ - - if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &device_param->metal_command_queue) == -1) - { - device_param->skipped = true; - - continue; - } - } - #endif - - if (device_param->is_opencl == true) - { - /* - cl_context_properties properties[3]; - - properties[0] = CL_CONTEXT_PLATFORM; - properties[1] = (cl_context_properties) device_param->opencl_platform; - properties[2] = 0; - - CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context); - */ - - if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1) - { - device_param->skipped = true; - - continue; - } - - /** - * create command-queue - */ - - // not 
supported with NV - // device_param->opencl_command_queue = hc_clCreateCommandQueueWithProperties (hashcat_ctx, device_param->opencl_device, NULL); - - if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1) - { - device_param->skipped = true; - - continue; - } - } + // re-using context/command-queue, there is no need to re-initialize them /** * create stream for CUDA devices @@ -10824,6 +10736,13 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_cuda == true) { + if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) + { + device_param->skipped = true; + + continue; + } + if (hc_cuStreamCreate (hashcat_ctx, &device_param->cuda_stream, CU_STREAM_DEFAULT) == -1) { device_param->skipped = true; @@ -10838,7 +10757,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_hip == true) { - if (hc_hipStreamCreate (hashcat_ctx, &device_param->hip_stream, hipStreamDefault) == -1) + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) + { + device_param->skipped = true; + + continue; + } + + if (hc_hipStreamCreateWithFlags (hashcat_ctx, &device_param->hip_stream, hipStreamDefault) == -1) { device_param->skipped = true; @@ -10880,21 +10806,21 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (device_param->is_hip == true) { - if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event1, hipEventBlockingSync) == -1) + if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event1, hipEventBlockingSync) == -1) { device_param->skipped = true; continue; } - if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event2, hipEventBlockingSync) == -1) + if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event2, hipEventBlockingSync) == -1) { device_param->skipped = true; continue; } - if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event3, hipEventDisableTiming) == -1) + if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event3, hipEventDisableTiming) == -1) { device_param->skipped = true; @@ -16927,16 +16853,6 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) - { - device_param->skipped = true; - - continue; - } - } - hardware_power_all += hardware_power_max; EVENT_DATA (EVENT_BACKEND_DEVICE_INIT_POST, &backend_devices_idx, sizeof (int)); @@ -17058,7 +16974,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->cuda_module_amp) hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_amp); if (device_param->cuda_module_shared) hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_shared); - if (device_param->cuda_context) hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context); + //if (device_param->cuda_context) hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context); device_param->cuda_d_pws_buf = 0; device_param->cuda_d_pws_amp_buf = 0; @@ -17135,7 +17051,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->cuda_module_amp = NULL; device_param->cuda_module_shared = NULL; - device_param->cuda_context = NULL; + //device_param->cuda_context = NULL; } if (device_param->is_hip == true) @@ -17189,8 +17105,6 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->hip_module_amp) hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_amp); if 
(device_param->hip_module_shared) hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_shared); - if (device_param->hip_context) hc_hipCtxDestroy (hashcat_ctx, device_param->hip_context); - device_param->hip_d_pws_buf = 0; device_param->hip_d_pws_amp_buf = 0; device_param->hip_d_pws_comp_buf = 0; @@ -17265,8 +17179,6 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->hip_module_mp = NULL; device_param->hip_module_amp = NULL; device_param->hip_module_shared = NULL; - - device_param->hip_context = NULL; } #if defined (__APPLE__) @@ -17341,7 +17253,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->metal_library_amp) hc_mtlReleaseLibrary (hashcat_ctx, device_param->metal_library_amp); if (device_param->metal_library_shared) hc_mtlReleaseLibrary (hashcat_ctx, device_param->metal_library_shared); - if (device_param->metal_command_queue) hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue); + //if (device_param->metal_command_queue) hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue); //if (device_param->metal_device) hc_mtlReleaseDevice (hashcat_ctx, device_param->metal_device); @@ -17411,7 +17323,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->metal_library_mp = NULL; device_param->metal_library_amp = NULL; device_param->metal_library_shared = NULL; - device_param->metal_command_queue = NULL; + //device_param->metal_command_queue = NULL; //device_param->metal_device = NULL; } #endif // __APPLE__ @@ -17487,9 +17399,9 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) if (device_param->opencl_program_amp) hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_amp); if (device_param->opencl_program_shared) hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_shared); - if (device_param->opencl_command_queue) hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue); + //if (device_param->opencl_command_queue) hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue); - if (device_param->opencl_context) hc_clReleaseContext (hashcat_ctx, device_param->opencl_context); + //if (device_param->opencl_context) hc_clReleaseContext (hashcat_ctx, device_param->opencl_context); device_param->opencl_d_pws_buf = NULL; device_param->opencl_d_pws_amp_buf = NULL; @@ -17557,8 +17469,8 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx) device_param->opencl_program_mp = NULL; device_param->opencl_program_amp = NULL; device_param->opencl_program_shared = NULL; - device_param->opencl_command_queue = NULL; - device_param->opencl_context = NULL; + //device_param->opencl_command_queue = NULL; + //device_param->opencl_context = NULL; } device_param->h_tmps = NULL; diff --git a/src/dispatch.c b/src/dispatch.c index ad7c9851b..9ac171f31 100644 --- a/src/dispatch.c +++ b/src/dispatch.c @@ -381,7 +381,7 @@ HC_API_CALL void *thread_calc_stdin (void *p) if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL; } if (calc_stdin (hashcat_ctx, device_param) == -1) @@ -396,11 +396,6 @@ HC_API_CALL void *thread_calc_stdin (void *p) if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL; - } - if (bridge_ctx->enabled == true) { 
if (bridge_ctx->thread_term != BRIDGE_DEFAULT) @@ -1685,7 +1680,7 @@ HC_API_CALL void *thread_calc (void *p) if (device_param->is_hip == true) { - if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL; + if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL; } if (calc (hashcat_ctx, device_param) == -1) @@ -1700,11 +1695,6 @@ HC_API_CALL void *thread_calc (void *p) if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL; } - if (device_param->is_hip == true) - { - if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL; - } - if (bridge_ctx->enabled == true) { if (bridge_ctx->thread_term != BRIDGE_DEFAULT) diff --git a/src/ext_hip.c b/src/ext_hip.c index 4a26a7e9f..9e7eb00ab 100644 --- a/src/ext_hip.c +++ b/src/ext_hip.c @@ -115,51 +115,55 @@ int hip_init (void *hashcat_ctx) // a good reference is cuda.h itself // this needs to be verified for each new cuda release - HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreateWithFlags, HIP_HIPEVENTCREATE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipDrvGetErrorName, HIP_HIPGETERRORNAME, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipDrvGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1); - HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, 
diff --git a/src/ext_hip.c b/src/ext_hip.c
index 4a26a7e9f..9e7eb00ab 100644
--- a/src/ext_hip.c
+++ b/src/ext_hip.c
@@ -115,51 +115,55 @@ int hip_init (void *hashcat_ctx)
   // a good reference is cuda.h itself
   // this needs to be verified for each new cuda release

-  HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreateWithFlags, HIP_HIPEVENTCREATE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipDrvGetErrorName, HIP_HIPGETERRORNAME, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipDrvGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync, hipMemcpyDtoDAsync, HIP_HIPMEMCPYDTODASYNC, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync, hipMemcpyDtoHAsync, HIP_HIPMEMCPYDTOHASYNC, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async, hipMemsetD32Async, HIP_HIPMEMSETD32ASYNC, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async, hipMemsetD8Async, HIP_HIPMEMSETD8ASYNC, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion, hipRuntimeGetVersion, HIP_HIPRUNTIMEGETVERSION, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties, hipGetDevicePropertiesR0600, HIP_HIPGETDEVICEPROPERTIES, HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreate, HIP_HIPEVENTCREATE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventCreateWithFlags, hipEventCreateWithFlags, HIP_HIPEVENTCREATEWITHFLAGS, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipDrvGetErrorName, HIP_HIPGETERRORNAME, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipDrvGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync, hipMemcpyDtoDAsync, HIP_HIPMEMCPYDTODASYNC, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync, hipMemcpyDtoHAsync, HIP_HIPMEMCPYDTOHASYNC, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async, hipMemsetD32Async, HIP_HIPMEMSETD32ASYNC, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async, hipMemsetD8Async, HIP_HIPMEMSETD8ASYNC, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion, hipRuntimeGetVersion, HIP_HIPRUNTIMEGETVERSION, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipSetDevice, hipSetDevice, HIP_HIPSETDEVICE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipSetDeviceFlags, hipSetDeviceFlags, HIP_HIPSETDEVICEFLAGS, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamCreateWithFlags, hipStreamCreateWithFlags, HIP_HIPSTREAMCREATEWITHFLAGS, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties, hipGetDevicePropertiesR0600, HIP_HIPGETDEVICEPROPERTIES, HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR, HIP, 1);

   return 0;
 }
@@ -507,13 +511,13 @@ int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion)
   return 0;
 }

-int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags)
+int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;

   HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;

-  const hipError_t HIP_err = hip->hipEventCreate (phEvent, Flags);
+  const hipError_t HIP_err = hip->hipEventCreate (phEvent);

   if (HIP_err != hipSuccess)
   {
@@ -534,6 +538,33 @@ int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flag
   return 0;
 }

+int hc_hipEventCreateWithFlags (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int flags)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
+
+  const hipError_t HIP_err = hip->hipEventCreateWithFlags (phEvent, flags);
+
+  if (HIP_err != hipSuccess)
+  {
+    const char *pStr = NULL;
+
+    if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
+    {
+      event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %d", HIP_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
 int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
@@ -1211,13 +1242,67 @@ int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion)
   return 0;
 }

-int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags)
+int hc_hipSetDevice (void *hashcat_ctx, hipDevice_t dev)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;

   HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;

-  const hipError_t HIP_err = hip->hipStreamCreate (phStream, Flags);
+  const hipError_t HIP_err = hip->hipSetDevice (dev);
+
+  if (HIP_err != hipSuccess)
+  {
+    const char *pStr = NULL;
+
+    if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
+    {
+      event_log_error (hashcat_ctx, "hipSetDevice(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "hipSetDevice(): %d", HIP_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_hipSetDeviceFlags (void *hashcat_ctx, unsigned int flags)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
+
+  const hipError_t HIP_err = hip->hipSetDeviceFlags (flags);
+
+  if (HIP_err != hipSuccess)
+  {
+    const char *pStr = NULL;
+
+    if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
+    {
+      event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %d", HIP_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
+
+  const hipError_t HIP_err = hip->hipStreamCreate (phStream);

   if (HIP_err != hipSuccess)
   {
@@ -1238,6 +1323,33 @@ int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int F
   return 0;
 }

+int hc_hipStreamCreateWithFlags (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
+
+  const hipError_t HIP_err = hip->hipStreamCreateWithFlags (phStream, Flags);
+
+  if (HIP_err != hipSuccess)
+  {
+    const char *pStr = NULL;
+
+    if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
+    {
+      event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %d", HIP_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
 int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
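
The wrappers added above keep the plain entry points (hc_hipEventCreate, hc_hipStreamCreate) flag-free and move the flag-taking variants into their own wrappers (hc_hipEventCreateWithFlags, hc_hipStreamCreateWithFlags, hc_hipSetDeviceFlags), matching how the HIP runtime itself splits these calls. The standalone sketch below only illustrates, with public HIP runtime symbols, where such flags typically enter a device setup; it is not how hashcat wires these wrappers internally, and the flag choices shown (hipDeviceScheduleBlockingSync, hipStreamNonBlocking, hipEventDisableTiming) are examples rather than hashcat defaults.

#include <hip/hip_runtime.h>
#include <stdio.h>

static void report (const char *func, const hipError_t err)
{
  fprintf (stderr, "%s(): %s\n", func, hipGetErrorString (err));
}

int main (void)
{
  hipError_t err;

  if ((err = hipSetDevice (0)) != hipSuccess) { report ("hipSetDevice", err); return 1; }

  // Device-wide scheduling policy, set per host thread before heavy use of the device.
  if ((err = hipSetDeviceFlags (hipDeviceScheduleBlockingSync)) != hipSuccess) { report ("hipSetDeviceFlags", err); return 1; }

  // A non-blocking stream does not implicitly synchronize with the NULL stream.
  hipStream_t stream = NULL;

  if ((err = hipStreamCreateWithFlags (&stream, hipStreamNonBlocking)) != hipSuccess) { report ("hipStreamCreateWithFlags", err); return 1; }

  // Timing can be disabled when an event is only used for ordering, not profiling.
  hipEvent_t event = NULL;

  if ((err = hipEventCreateWithFlags (&event, hipEventDisableTiming)) != hipSuccess) { report ("hipEventCreateWithFlags", err); return 1; }

  hipEventDestroy (event);
  hipStreamDestroy (stream);

  return 0;
}
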
diff --git a/src/selftest.c b/src/selftest.c
index 5e744f88b..6c63a3418 100644
--- a/src/selftest.c
+++ b/src/selftest.c
@@ -1272,7 +1272,7 @@ HC_API_CALL void *thread_selftest (void *p)

   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
   }

   const int rc_selftest = process_selftest (hashcat_ctx, device_param);
@@ -1303,8 +1303,6 @@ HC_API_CALL void *thread_selftest (void *p)
   if (device_param->is_hip == true)
   {
     if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return NULL;
-
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
   }

   if (bridge_ctx->enabled == true)
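
Every hc_hip* wrapper touched or added in this patch follows the same error-reporting shape: call through the dlopen'd HIP function table, translate the status with the driver-style string lookup, and fall back to printing the raw numeric code if the lookup itself fails. A minimal standalone version of that shape is sketched below, assuming a ROCm release new enough to export hipDrvGetErrorString; the check_hip helper is an illustrative name, not a hashcat symbol.

#include <hip/hip_runtime.h>
#include <stdio.h>

// Translate a HIP status into a readable message, mirroring the wrapper
// pattern above: prefer the error string, fall back to the numeric code.
static int check_hip (const char *func, const hipError_t err)
{
  if (err == hipSuccess) return 0;

  const char *pStr = NULL;

  if (hipDrvGetErrorString (err, &pStr) == hipSuccess)
  {
    fprintf (stderr, "%s(): %s\n", func, pStr);
  }
  else
  {
    fprintf (stderr, "%s(): %d\n", func, (int) err);
  }

  return -1;
}

int main (void)
{
  // hipInit() is the same driver-style entry point hashcat loads at startup.
  return (check_hip ("hipInit", hipInit (0)) == -1) ? 1 : 0;
}
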