diff --git a/BUILD_WSL.md b/BUILD_WSL.md
index cd5b6fd5b..957208381 100644
--- a/BUILD_WSL.md
+++ b/BUILD_WSL.md
@@ -19,7 +19,7 @@ cd win-iconv/
 cmake -D WIN_ICONV_BUILD_EXECUTABLE=OFF -D CMAKE_INSTALL_PREFIX=/opt/win-iconv-64 -D CMAKE_CXX_COMPILER=$(which x86_64-w64-mingw32-g++) -D CMAKE_C_COMPILER=$(which x86_64-w64-mingw32-gcc) -D CMAKE_SYSTEM_NAME=Windows
 sudo make install
 cd ../
-wget https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-python-3.12.10-1-any.pkg.tar.zst
+wget https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-python-3.12.11-1-any.pkg.tar.zst
 sudo mkdir /opt/win-python
-sudo tar --zstd -xf mingw-w64-x86_64-python-3.12.10-1-any.pkg.tar.zst -C /opt/win-python
+sudo tar --zstd -xf mingw-w64-x86_64-python-3.12.11-1-any.pkg.tar.zst -C /opt/win-python
 ```
diff --git a/OpenCL/inc_amp.h b/OpenCL/inc_amp.h
index 5db6a21d8..e49b23959 100644
--- a/OpenCL/inc_amp.h
+++ b/OpenCL/inc_amp.h
@@ -16,7 +16,7 @@
   GLOBAL_AS   const bf_t          *bfs_buf,    \
   CONSTANT_AS const u32           &combs_mode, \
   CONSTANT_AS const u64           &gid_max,    \
-                    uint           hc_gid [[ thread_position_in_grid ]]
+                    uint3          hc_gid [[ thread_position_in_grid ]]
 
 #else // CUDA, HIP, OpenCL
 
diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl
index a23efd42b..b13dc4d5f 100644
--- a/OpenCL/inc_common.cl
+++ b/OpenCL/inc_common.cl
@@ -1946,6 +1946,19 @@ DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c)
 
 #endif
 
+/**
+ * arithmetic operations
+ */
+
+DECLSPEC u32 hc_umulhi (const u32 x, const u32 y)
+{
+  #if !defined IS_CUDA && !defined IS_HIP
+  return h32_from_64_S ((u64) x * (u64) y); // widen to 64 bit, multiply, keep the upper half
+  #else
+  return __umulhi (x, y); // CUDA and HIP builds use the __umulhi intrinsic instead
+  #endif
+}
+
 /**
  * pure scalar functions
  */
@@ -41405,7 +41418,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u
   #endif
 
   #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
-
   const int offset_mod_4 = offset & 3;
 
   const int offset_minus_4 = 4 - offset_mod_4;
@@ -41979,6 +41991,582 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u
       w0[1] = 0;
       w0[0] = 0;
       break;
+
+    case 16:
+      w7[3] = hc_byte_perm_S (w3[2], w3[3], selector);
+      w7[2] = hc_byte_perm_S (w3[1], w3[2], selector);
+      w7[1] = hc_byte_perm_S (w3[0], w3[1], selector);
+      w7[0] = hc_byte_perm_S (w2[3], w3[0], selector);
+      w6[3] = hc_byte_perm_S (w2[2], w2[3], selector);
+      w6[2] = hc_byte_perm_S (w2[1], w2[2], selector);
+      w6[1] = hc_byte_perm_S (w2[0], w2[1], selector);
+      w6[0] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w5[3] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w5[2] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w5[1] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w5[0] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w4[3] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w4[2] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w4[1] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w4[0] = hc_byte_perm_S (    0, w0[0], selector);
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 17:
+      w7[3] = hc_byte_perm_S (w3[1], w3[2], selector);
+      w7[2] = hc_byte_perm_S (w3[0], w3[1], selector);
+      w7[1] = hc_byte_perm_S (w2[3], w3[0], selector);
+      w7[0] = hc_byte_perm_S (w2[2], w2[3], selector);
+      w6[3] = hc_byte_perm_S (w2[1], w2[2], selector);
+      w6[2] = hc_byte_perm_S (w2[0], w2[1], selector);
+      w6[1] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w6[0] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w5[3] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w5[2] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w5[1] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w5[0] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w4[3] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w4[2] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w4[1] = hc_byte_perm_S (    0, w0[0], selector);
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 18:
+      w7[3] = hc_byte_perm_S (w3[0], w3[1], selector);
+      w7[2] = hc_byte_perm_S (w2[3], w3[0], selector);
+      w7[1] = hc_byte_perm_S (w2[2], w2[3], selector);
+      w7[0] = hc_byte_perm_S (w2[1], w2[2], selector);
+      w6[3] = hc_byte_perm_S (w2[0], w2[1], selector);
+      w6[2] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w6[1] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w6[0] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w5[3] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w5[2] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w5[1] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w5[0] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w4[3] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w4[2] = hc_byte_perm_S (    0, w0[0], selector);
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 19:
+      w7[3] = hc_byte_perm_S (w2[3], w3[0], selector);
+      w7[2] = hc_byte_perm_S (w2[2], w2[3], selector);
+      w7[1] = hc_byte_perm_S (w2[1], w2[2], selector);
+      w7[0] = hc_byte_perm_S (w2[0], w2[1], selector);
+      w6[3] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w6[2] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w6[1] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w6[0] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w5[3] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w5[2] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w5[1] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w5[0] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w4[3] = hc_byte_perm_S (    0, w0[0], selector);
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 20:
+      w7[3] = hc_byte_perm_S (w2[2], w2[3], selector);
+      w7[2] = hc_byte_perm_S (w2[1], w2[2], selector);
+      w7[1] = hc_byte_perm_S (w2[0], w2[1], selector);
+      w7[0] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w6[3] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w6[2] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w6[1] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w6[0] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w5[3] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w5[2] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w5[1] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w5[0] = hc_byte_perm_S (    0, w0[0], selector);
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 21:
+      w7[3] = hc_byte_perm_S (w2[1], w2[2], selector);
+      w7[2] = hc_byte_perm_S (w2[0], w2[1], selector);
+      w7[1] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w7[0] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w6[3] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w6[2] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w6[1] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w6[0] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w5[3] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w5[2] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w5[1] = hc_byte_perm_S (    0, w0[0], selector);
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 22:
+      w7[3] = hc_byte_perm_S (w2[0], w2[1], selector);
+      w7[2] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w7[1] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w7[0] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w6[3] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w6[2] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w6[1] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w6[0] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w5[3] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w5[2] = hc_byte_perm_S (    0, w0[0], selector);
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 23:
+      w7[3] = hc_byte_perm_S (w1[3], w2[0], selector);
+      w7[2] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w7[1] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w7[0] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w6[3] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w6[2] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w6[1] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w6[0] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w5[3] = hc_byte_perm_S (    0, w0[0], selector);
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 24:
+      w7[3] = hc_byte_perm_S (w1[2], w1[3], selector);
+      w7[2] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w7[1] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w7[0] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w6[3] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w6[2] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w6[1] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w6[0] = hc_byte_perm_S (    0, w0[0], selector);
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 25:
+      w7[3] = hc_byte_perm_S (w1[1], w1[2], selector);
+      w7[2] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w7[1] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w7[0] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w6[3] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w6[2] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w6[1] = hc_byte_perm_S (    0, w0[0], selector);
+      w6[0] = 0;
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 26:
+      w7[3] = hc_byte_perm_S (w1[0], w1[1], selector);
+      w7[2] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w7[1] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w7[0] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w6[3] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w6[2] = hc_byte_perm_S (    0, w0[0], selector);
+      w6[1] = 0;
+      w6[0] = 0;
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 27:
+      w7[3] = hc_byte_perm_S (w0[3], w1[0], selector);
+      w7[2] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w7[1] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w7[0] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w6[3] = hc_byte_perm_S (    0, w0[0], selector);
+      w6[2] = 0;
+      w6[1] = 0;
+      w6[0] = 0;
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 28:
+      w7[3] = hc_byte_perm_S (w0[2], w0[3], selector);
+      w7[2] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w7[1] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w7[0] = hc_byte_perm_S (    0, w0[0], selector);
+      w6[3] = 0;
+      w6[2] = 0;
+      w6[1] = 0;
+      w6[0] = 0;
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 29:
+      w7[3] = hc_byte_perm_S (w0[1], w0[2], selector);
+      w7[2] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w7[1] = hc_byte_perm_S (    0, w0[0], selector);
+      w7[0] = 0;
+      w6[3] = 0;
+      w6[2] = 0;
+      w6[1] = 0;
+      w6[0] = 0;
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 30:
+      w7[3] = hc_byte_perm_S (w0[0], w0[1], selector);
+      w7[2] = hc_byte_perm_S (    0, w0[0], selector);
+      w7[1] = 0;
+      w7[0] = 0;
+      w6[3] = 0;
+      w6[2] = 0;
+      w6[1] = 0;
+      w6[0] = 0;
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
+
+    case 31:
+      w7[3] = hc_byte_perm_S (    0, w0[0], selector);
+      w7[2] = 0;
+      w7[1] = 0;
+      w7[0] = 0;
+      w6[3] = 0;
+      w6[2] = 0;
+      w6[1] = 0;
+      w6[0] = 0;
+      w5[3] = 0;
+      w5[2] = 0;
+      w5[1] = 0;
+      w5[0] = 0;
+      w4[3] = 0;
+      w4[2] = 0;
+      w4[1] = 0;
+      w4[0] = 0;
+      w3[3] = 0;
+      w3[2] = 0;
+      w3[1] = 0;
+      w3[0] = 0;
+      w2[3] = 0;
+      w2[2] = 0;
+      w2[1] = 0;
+      w2[0] = 0;
+      w1[3] = 0;
+      w1[2] = 0;
+      w1[1] = 0;
+      w1[0] = 0;
+      w0[3] = 0;
+      w0[2] = 0;
+      w0[1] = 0;
+      w0[0] = 0;
+
+      break;
   }
   #endif
 }
diff --git a/OpenCL/inc_common.h b/OpenCL/inc_common.h
index 0b51fbd29..d0b2ed989 100644
--- a/OpenCL/inc_common.h
+++ b/OpenCL/inc_common.h
@@ -124,10 +124,10 @@
 
 #if defined IS_METAL
 #define KERN_ATTR_MAIN_PARAMS                       \
-  uint hc_gid [[ thread_position_in_grid ]],        \
-  uint hc_lid [[ thread_position_in_threadgroup ]], \
-  uint hc_lsz [[ threads_per_threadgroup ]],        \
-  uint hc_bid [[ threadgroup_position_in_grid ]]
+  uint3 hc_gid [[ thread_position_in_grid ]],        \
+  uint3 hc_lid [[ thread_position_in_threadgroup ]], \
+  uint3 hc_lsz [[ threads_per_threadgroup ]],        \
+  uint3 hc_bid [[ threadgroup_position_in_grid ]]
 #endif // IS_METAL
 
 /*
@@ -284,6 +284,10 @@ DECLSPEC u32  hc_bfe_S          (const u32  a, const u32  b, const u32  c);
 DECLSPEC u32x hc_lop_0x96       (const u32x a, const u32x b, const u32x c);
 DECLSPEC u32  hc_lop_0x96_S     (const u32  a, const u32  b, const u32  c);
 
+// arithmetic operations
+
+DECLSPEC u32  hc_umulhi (const u32 x, const u32 y);
+
 // legacy common code
 
 DECLSPEC int ffz (const u32 v);
diff --git a/OpenCL/inc_hash_argon2.cl b/OpenCL/inc_hash_argon2.cl
new file mode 100644
index 000000000..c87179d2c
--- /dev/null
+++ b/OpenCL/inc_hash_argon2.cl
@@ -0,0 +1,401 @@
+/**
+ * Author......: Netherlands Forensic Institute
+ * License.....: MIT
+ *
+ * Warp code based on original work by Ondrej Mosnáček
+ */
+
+#include "inc_vendor.h"
+#include "inc_types.h"
+#include "inc_platform.h"
+#include "inc_common.h"
+#include "inc_hash_blake2b.h"
+#include "inc_hash_argon2.h"
+
+DECLSPEC void argon2_initial_block (PRIVATE_AS const u32 *in, const u32 lane, const u32 blocknum, const u32 parallelism, GLOBAL_AS argon2_block_t *blocks)
+{
+  // Destination is block 'blocknum' of 'lane'; the index shows blocks are interleaved by lane
+  GLOBAL_AS u64 *dst = blocks[(blocknum * parallelism) + lane].values;
+
+  u64 scratch[16] = { 0 };
+
+  blake2b_ctx_t ctx;
+
+  blake2b_init (&ctx);
+
+  // First digest absorbs: sizeof(argon2_block_t) as 4 bytes, 64 bytes of 'in',
+  // then 8 bytes holding hl32_to_64 (lane, blocknum)
+  scratch[0] = sizeof(argon2_block_t);
+
+  blake2b_update (&ctx, (PRIVATE_AS u32 *) scratch, 4);
+  blake2b_update (&ctx, in, 64);
+
+  scratch[0] = hl32_to_64 (lane, blocknum);
+
+  blake2b_update (&ctx, (PRIVATE_AS u32 *) scratch, 8);
+
+  blake2b_final (&ctx);
+
+  for (u32 idx = 0; idx < 4; idx++) dst[idx] = ctx.h[idx];
+
+  // Chain: hash the previous 64-byte state again and keep the first four words of each round
+  u32 pos = 4;
+
+  while (pos < 124)
+  {
+    for (u32 idx = 0; idx < 8; idx++) scratch[idx] = ctx.h[idx];
+
+    blake2b_init (&ctx);
+    blake2b_transform (ctx.h, scratch, 64, BLAKE2B_FINAL);
+
+    for (u32 idx = 0; idx < 4; idx++) dst[pos + idx] = ctx.h[idx];
+
+    pos += 4;
+  }
+
+  // The final round also contributes its remaining four words
+  for (u32 idx = 4; idx < 8; idx++) dst[120 + idx] = ctx.h[idx];
+}
+
+DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u64 *blockhash)
+{
+  // Parameter words, absorbed in the order they are stored below (6 * 4 = 24 bytes)
+  u32 params[32] = { 0 };
+
+  params[0] = options->parallelism;
+  params[1] = options->digest_len;
+  params[2] = options->memory_usage_in_kib;
+  params[3] = options->iterations;
+  params[4] = options->version;
+  params[5] = options->type;
+
+  u32 field_len[32] = { 0 };
+
+  blake2b_ctx_t ctx;
+  blake2b_init (&ctx);
+  blake2b_update (&ctx, params, 24);
+
+  // Password: 4-byte length, then the password data read from global memory
+  field_len[0] = pw->pw_len;
+  blake2b_update (&ctx, field_len, 4);
+  blake2b_update_global (&ctx, pw->i, pw->pw_len);
+
+  // Salt: same layout as the password
+  field_len[0] = salt->salt_len;
+  blake2b_update (&ctx, field_len, 4);
+  blake2b_update_global (&ctx, salt->salt_buf, salt->salt_len);
+
+  // Secret (K) and associated data (X): only a zero length is absorbed for each
+  field_len[0] = 0;
+  blake2b_update (&ctx, field_len, 4);
+  blake2b_update (&ctx, field_len, 4);
+
+  blake2b_final (&ctx);
+
+  for (u32 idx = 0; idx < 8; idx++) blockhash[idx] = ctx.h[idx];
+}
+
+DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt,
+                           PRIVATE_AS const argon2_options_t *options, GLOBAL_AS argon2_block_t *out)
+{
+  const u32 lanes = options->parallelism;
+  u64 h0[16] = { 0 };
+
+  argon2_initial_hash (pw, salt, options, h0);
+
+  // Seed blocks 0 and 1 of every lane from the initial hash
+  for (u32 lane = 0; lane < lanes; lane++)
+  {
+    for (u32 blocknum = 0; blocknum < 2; blocknum++) argon2_initial_block ((PRIVATE_AS u32 *) h0, lane, blocknum, lanes, out);
+  }
+}
+
+// TODO: reconsider 'trunc_mul()'
+DECLSPEC u64 trunc_mul (u64 x, u64 y)
+{
+  // 64-bit product of the low 32-bit halves of x and y, assembled from its high and low word
+  const u32 a = (u32) x, b = (u32) y;
+  return hl32_to_64_S (hc_umulhi (a, b), (u32) (a * b));
+}
+
+DECLSPEC inline u32 argon2_ref_address (PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, u32 index, u64 pseudo_random)
+{
+  // Maps a 64-bit pseudo-random value to the flat index of the reference block for the block
+  // at position 'index' of the current segment:
+  //   low  32 bits of pseudo_random -> position inside the referenceable window
+  //   high 32 bits of pseudo_random -> reference lane (except in slice 0 of pass 0)
+  // References into the own lane additionally get (index - 1) blocks added to the window.
+  // The returned value is (parallelism * ref_index) + ref_lane.
+
+  const bool first_slice = (pos->pass == 0) && (pos->slice == 0);
+
+  // In the very first slice only the own lane may be referenced
+  const u32 ref_lane = first_slice ? pos->lane : (h32_from_64_S (pseudo_random) % options->parallelism);
+
+  // Window size: the finished slices of pass 0, or (ARGON2_SYNC_POINTS - 1) slices in later passes
+  u32 ref_area = ((pos->pass == 0) ? pos->slice : (ARGON2_SYNC_POINTS - 1)) * options->segment_length;
+
+  if ((ref_lane == pos->lane) || (index == 0))
+  {
+    // for index == 0 the unsigned (index - 1) wraps, which shrinks the window by one
+    ref_area += (index - 1);
+  }
+
+  // if ref_area == 0xFFFFFFFF => bug
+
+  const u32 j1 = l32_from_64_S (pseudo_random);
+
+  u32 ref_index = ref_area - 1 - hc_umulhi (ref_area, hc_umulhi (j1, j1));
+
+  if (pos->pass > 0)
+  {
+    // later passes: offset by the start of the slice after the current one, wrapping at the lane end
+    ref_index += (pos->slice + 1) * options->segment_length;
+
+    if (ref_index >= options->lane_length)
+    {
+      ref_index -= options->lane_length;
+    }
+  }
+
+  return (options->parallelism * ref_index) + ref_lane;
+}
+
+DECLSPEC void swap_u64 (PRIVATE_AS u64 *x, PRIVATE_AS u64 *y)
+{
+  const u64 held = *y; // exchange the two 64-bit values behind x and y
+  *y = *x;
+  *x = held;
+}
+
+DECLSPEC void transpose_permute_block (u64 R[4], int argon2_thread)
+{
+  if ((argon2_thread & 0x08) != 0) // thread-id bit 3 set: R[0] <-> R[2] and R[1] <-> R[3]
+  {
+    swap_u64 (R + 0, R + 2);
+    swap_u64 (R + 1, R + 3);
+  }
+  if ((argon2_thread & 0x04) != 0) // thread-id bit 2 set: R[0] <-> R[1] and R[2] <-> R[3]
+  {
+    swap_u64 (R + 0, R + 1);
+    swap_u64 (R + 2, R + 3);
+  }
+}
+
+DECLSPEC int argon2_shift (int idx, int argon2_thread)
+{
+  const int step = (idx & 0x01) + ((idx & 0x02) << 3); // idx bit 0 stays at bit 0, idx bit 1 moves to bit 4
+  return (((argon2_thread & 0x11) + step + 0x0e) & 0x11) | (argon2_thread & 0x0e); // thread-id bits 1..3 pass through unchanged
+}
+
+DECLSPEC void argon2_hash_block (u64 R[4], int argon2_thread, LOCAL_AS u64 *shuffle_buf, int argon2_lsz)
+{ // mixes this thread's four words R[0..3] in place; R[1..3] are exchanged with other threads via hc__shfl_sync between the ARGON2_G steps
+  for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz);
+
+  transpose_permute_block (R, argon2_thread); // in-thread swaps keyed on bits 3 and 2 of argon2_thread
+
+  for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz);
+
+  ARGON2_G(R[0], R[1], R[2], R[3]); // 1st of four ARGON2_G applications
+
+  for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx],  (argon2_thread & 0x1c) | ((argon2_thread + idx) & 0x03), argon2_thread, argon2_lsz);
+
+  ARGON2_G(R[0], R[1], R[2], R[3]); // 2nd application, after rotating the source by +idx within each group of four threads
+
+  for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], ((argon2_thread & 0x1c) | ((argon2_thread - idx) & 0x03)) ^ (idx << 2), argon2_thread, argon2_lsz);
+
+  transpose_permute_block (R, argon2_thread); // same conditional swaps as above
+
+  for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz);
+
+  ARGON2_G(R[0], R[1], R[2], R[3]); // 3rd application
+
+  for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift (idx, argon2_thread), argon2_thread, argon2_lsz);
+
+  ARGON2_G(R[0], R[1], R[2], R[3]); // 4th application, source threads chosen by argon2_shift (idx, ...)
+
+  for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift ((4 - idx), argon2_thread), argon2_thread, argon2_lsz);
+}
+
+DECLSPEC void argon2_next_addresses (PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, PRIVATE_AS u32 *addresses, u32 start_index, u32 argon2_thread, LOCAL_AS u64 *shuffle_buf, u32 argon2_lsz)
+{
+  // Word 0 of the address-generation input: threads 0..6 each carry one parameter
+  // (pass, lane, slice, block count, iterations, type, 1-based number of the 128-index batch);
+  // all other threads and all other words start out as zero
+  u64 seed;
+
+  if      (argon2_thread == 0) seed = pos->pass;
+  else if (argon2_thread == 1) seed = pos->lane;
+  else if (argon2_thread == 2) seed = pos->slice;
+  else if (argon2_thread == 3) seed = options->memory_block_count;
+  else if (argon2_thread == 4) seed = options->iterations;
+  else if (argon2_thread == 5) seed = options->type;
+  else if (argon2_thread == 6) seed = (start_index / 128) + 1;
+  else                         seed = 0;
+
+  u64 Z[4];
+
+  Z[0] = seed;
+  Z[1] = 0;
+  Z[2] = 0;
+  Z[3] = 0;
+
+  // First compression; its input (non-zero in word 0 only) is XORed into the output
+  argon2_hash_block (Z, argon2_thread, shuffle_buf, argon2_lsz);
+
+  Z[0] ^= seed;
+
+  // Second compression, with its input XORed into the output again
+  u64 prev[4];
+
+  for (u32 idx = 0; idx < 4; idx++) prev[idx] = Z[idx];
+
+  argon2_hash_block (Z, argon2_thread, shuffle_buf, argon2_lsz);
+
+  for (u32 idx = 0; idx < 4; idx++) Z[idx] ^= prev[idx];
+
+  // Four addresses per thread, for block indices THREADS_PER_LANE apart
+  for (u32 i = 0; i < 4; i++)
+  {
+    const u32 index = start_index + argon2_thread + (i * THREADS_PER_LANE);
+
+    addresses[i] = argon2_ref_address (options, pos, index, Z[i]);
+  }
+
+  // if addresses[0] == 0xFFFFFFFE => bug
+}
+
+DECLSPEC u32 index_u32x4 (const u32 array[4], u32 index)
+{
+  // Selects array[index] by comparison, so every element access uses a constant subscript.
+  // An index outside 0..3 yields (u32) -1.
+  u32 picked = -1;
+
+  switch (index)
+  {
+    case 0: picked = array[0]; break;
+    case 1: picked = array[1]; break;
+    case 2: picked = array[2]; break;
+    case 3: picked = array[3]; break;
+  }
+
+  return picked;
+}
+
+DECLSPEC GLOBAL_AS argon2_block_t *argon2_get_current_block (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, u32 lane, u32 index_in_lane, u64 R[4], u32 argon2_thread)
+{
+  const u32 lanes = options->parallelism;
+  // The block before index 0 is the last block of the lane (wrap-around)
+  const u32 prev_in_lane = ((index_in_lane != 0) ? index_in_lane : options->lane_length) - 1;
+
+  // Each thread preloads its four words of the previous block, THREADS_PER_LANE words apart
+  GLOBAL_AS const u64 *prev = blocks[(prev_in_lane * lanes) + lane].values;
+  for (u32 idx = 0; idx < 4; idx++) R[idx] = prev[(idx * THREADS_PER_LANE) + argon2_thread];
+
+  return blocks + ((index_in_lane * lanes) + lane);
+}
+
+DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, bool indep_addr, const u32 addresses[4],
+                                      u32 start_index, u32 end_index, GLOBAL_AS argon2_block_t *cur_block, u64 R[4], u32 argon2_thread, LOCAL_AS u64 *shuffle_buf, u32 argon2_lsz)
+{
+  // Only passes after the first XOR over the old block content, and never for version 0x10
+  const bool xor_old = (pos->pass > 0) && (options->version != ARGON2_VERSION_10);
+
+  for (u32 index = start_index; index < end_index; index++, cur_block += options->parallelism)
+  {
+    u32 ref_address;
+    if (indep_addr == false)
+    {
+      // computed from this thread's R[0]; the value held by source lane 0 is then used by all
+      ref_address = argon2_ref_address (options, pos, index, R[0]);
+      ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, 0, argon2_thread, argon2_lsz);
+    }
+    else
+    {
+      // picked from the precomputed addresses; the value held by source lane 'index' is then used
+      ref_address = index_u32x4 (addresses, (index / THREADS_PER_LANE) % ARGON2_SYNC_POINTS);
+      ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, index, argon2_thread, argon2_lsz);
+    }
+
+    GLOBAL_AS const u64 *ref_values = blocks[ref_address].values;
+    u64 carry[4];
+
+    for (u32 idx = 0; idx < 4; idx++)
+    {
+      const u32 word = (idx * THREADS_PER_LANE) + argon2_thread;
+      R[idx] ^= ref_values[word];
+      carry[idx] = xor_old ? (cur_block->values[word] ^ R[idx]) : R[idx];
+    }
+
+    argon2_hash_block (R, argon2_thread, shuffle_buf, argon2_lsz);
+
+    for (u32 idx = 0; idx < 4; idx++) R[idx] ^= carry[idx];
+
+    for (u32 idx = 0; idx < 4; idx++) cur_block->values[(idx * THREADS_PER_LANE) + argon2_thread] = R[idx];
+  }
+}
+
+DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf, const u32 argon2_thread, const u32 argon2_lsz)
+{
+  // Only the first segment of the first pass skips blocks 0 and 1, which already exist by then
+  const u32 skip_blocks   = ((pos->pass == 0) && (pos->slice == 0)) ? 2 : 0;
+  const u32 index_in_lane = skip_blocks + (pos->slice * options->segment_length);
+
+  u64 R[4];
+
+  GLOBAL_AS argon2_block_t *cur_block = argon2_get_current_block (blocks, options, pos->lane, index_in_lane, R, argon2_thread);
+
+  // Precomputed addresses: always for TYPE_I, for TYPE_ID only in slices 0 and 1 of pass 0
+  const bool indep_addr = (options->type == TYPE_I) || ((options->type == TYPE_ID) && (pos->pass == 0) && (pos->slice <= 1));
+
+  if (indep_addr == false)
+  {
+    u32 addresses[4] = { 0 };
+    argon2_fill_subsegment (blocks, options, pos, false, addresses, skip_blocks, options->segment_length, cur_block, R, argon2_thread, shuffle_buf, argon2_lsz);
+    return;
+  }
+
+  for (u32 block_index = 0; block_index < options->segment_length; block_index += 128)
+  {
+    u32 addresses[4] = { 0, 0, 0, 0 };
+    const u32 start_index = (block_index != 0) ? block_index : skip_blocks;
+    const u32 end_index   = MIN(((start_index | 127) + 1), options->segment_length);
+
+    argon2_next_addresses (options, pos, addresses, block_index, argon2_thread, shuffle_buf, argon2_lsz);
+    argon2_fill_subsegment (blocks, options, pos, true, addresses, start_index, end_index, cur_block, R, argon2_thread, shuffle_buf, argon2_lsz);
+
+    cur_block += (end_index - start_index) * options->parallelism;
+  }
+}
+
+DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u32 *out)
+{
+  // Final step: XOR the last block of every lane together and hash the result into 'out'
+  const u32 lanes = options->parallelism;
+  const u32 last  = (options->lane_length - 1) * lanes; // flat index of lane 0's last block
+
+  argon2_block_t final_block = { };
+
+  for (u32 l = 0; l < lanes; l++)
+  {
+    for (u32 idx = 0; idx < 128; idx++) final_block.values[idx] ^= blocks[last + l].values[idx];
+  }
+
+  u32 output_len[32] = { 0 };
+  output_len[0] = options->digest_len;
+
+  blake2b_ctx_t ctx;
+  blake2b_init (&ctx);
+
+  // Override default (0x40) value in BLAKE2b
+  ctx.h[0] ^= 0x40 ^ options->digest_len;
+
+  blake2b_update (&ctx, output_len, 4);
+  blake2b_update (&ctx, (PRIVATE_AS u32 *) final_block.values, sizeof(final_block));
+
+  blake2b_final (&ctx);
+  // Emit exactly digest_len / 4 words: with an odd count the high word of the last u64 is not written
+  for (u32 i = 0; i < (options->digest_len / 4); i++)
+  {
+    out[i] = (i & 1) ? h32_from_64_S (ctx.h[i / 2]) : l32_from_64_S (ctx.h[i / 2]);
+  }
+}
diff --git a/OpenCL/inc_hash_argon2.h b/OpenCL/inc_hash_argon2.h
new file mode 100644
index 000000000..a03b7c480
--- /dev/null
+++ b/OpenCL/inc_hash_argon2.h
@@ -0,0 +1,157 @@
+
+/**
+ * Author......: Netherlands Forensic Institute
+ * License.....: MIT
+ */
+
+#ifndef INC_HASH_ARGON2_H
+#define INC_HASH_ARGON2_H
+
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+#define ARGON2_VERSION_10 0x10
+#define ARGON2_VERSION_13 0x13
+
+#define THREADS_PER_LANE 32
+#define FULL_MASK 0xffffffff
+
+#define BLAKE2B_OUTBYTES 64
+#define ARGON2_SYNC_POINTS 4
+#define ARGON2_ADDRESSES_IN_BLOCK 128
+
+#define TYPE_D  0
+#define TYPE_I  1
+#define TYPE_ID 2
+
+#if defined IS_CUDA
+#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) __shfl_sync ((mask),(var),(srcLane))
+#elif defined IS_HIP
+// attention: shuffle width is hard coded to 32 for hip here
+#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) __shfl ((var),(srcLane),32)
+#elif defined IS_OPENCL
+#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) hc__shfl ((shfbuf),(var),(srcLane),(argon2_thread),(argon2_lsz))
+
+#if defined IS_AMD && defined IS_GPU
+DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz)
+{ // AMD GPU variant: returns the var held by src_lane; shuffle_buf, argon2_thread and argon2_lsz are not used here.
+  const u32 idx = src_lane << 2; // lane index times 4 - presumably ds_bpermute expects a byte offset, confirm against AMD docs
+
+  const u32 l32 = l32_from_64_S (var);
+  const u32 h32 = h32_from_64_S (var);
+
+  const u32 l32r = __builtin_amdgcn_ds_bpermute (idx, l32); // the builtin moves 32 bits, so the 64-bit value goes in two halves
+  const u32 h32r = __builtin_amdgcn_ds_bpermute (idx, h32);
+
+  const u64 out = hl32_to_64_S (h32r, l32r);
+
+  return out;
+}
+#elif defined IS_NV && defined IS_GPU
+DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz)
+{ // NVIDIA GPU variant (OpenCL): returns the var held by src_lane via inline PTX; shuffle_buf, argon2_thread and argon2_lsz are not used here.
+  const u32 l32 = l32_from_64_S (var);
+  const u32 h32 = h32_from_64_S (var);
+
+  u32 l32r;
+  u32 h32r;
+
+  asm("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0;" // NOTE(review): the last operand looks like the membermask and is 0 here - confirm against the PTX ISA
+      : "=r"(l32r)
+      : "r"(l32), "r"(src_lane));
+
+  asm("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0;" // same shuffle for the high 32 bits
+      : "=r"(h32r)
+      : "r"(h32), "r"(src_lane));
+
+  const u64 out = hl32_to_64_S (h32r, l32r);
+
+  return out;
+}
+#else
+DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz)
+{ // Fallback shuffle through local memory: publish var in this thread's slot, then return the slot selected by src_lane.
+  shuffle_buf[argon2_thread] = var;
+
+  barrier (CLK_LOCAL_MEM_FENCE); // all slots are written before any slot is read
+
+  const u64 out = shuffle_buf[src_lane & (argon2_lsz - 1)]; // mask wraps src_lane; assumes argon2_lsz is a power of two
+  barrier (CLK_LOCAL_MEM_FENCE); // all slots are read before a following call may overwrite them (write-after-read hazard)
+  return out;
+}
+#endif
+
+#elif defined IS_METAL
+#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) hc__shfl ((shfbuf),(var),(srcLane),(argon2_thread),(argon2_lsz))
+
+DECLSPEC u64 hc__shfl (LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz)
+{ // Metal shuffle through threadgroup memory: publish var in this thread's slot, then return the slot selected by src_lane.
+  shuffle_buf[argon2_thread] = var;
+
+  SYNC_THREADS(); // all slots are written before any slot is read
+
+  const u64 out = shuffle_buf[src_lane & (argon2_lsz - 1)]; // mask wraps src_lane; assumes argon2_lsz is a power of two
+  SYNC_THREADS(); // all slots are read before a following call may overwrite them (write-after-read hazard)
+  return out;
+}
+#endif
+// ARGON2_G: updates a, b, c, d in place; every add step also adds 2 * trunc_mul of its two operands. Arguments are expanded several times, so pass plain lvalues.
+#define ARGON2_G(a,b,c,d)                \
+{                                        \
+  a = a + b + 2 * trunc_mul(a, b);       \
+  d = blake2b_rot32_S (d ^ a);           \
+  c = c + d + 2 * trunc_mul(c, d);       \
+  b = blake2b_rot24_S (b ^ c);           \
+  a = a + b + 2 * trunc_mul(a, b);       \
+  d = blake2b_rot16_S (d ^ a);           \
+  c = c + d + 2 * trunc_mul(c, d);       \
+  b = hc_rotr64_S (b ^ c, 63);           \
+}
+// ARGON2_P: runs ARGON2_G over v[0..15], first on the index columns (0,4,8,12 ...), then on the diagonals (0,5,10,15 ...); expects an array v in scope.
+#define ARGON2_P()                       \
+{                                        \
+  ARGON2_G(v[0], v[4], v[8], v[12]);     \
+  ARGON2_G(v[1], v[5], v[9], v[13]);     \
+  ARGON2_G(v[2], v[6], v[10], v[14]);    \
+  ARGON2_G(v[3], v[7], v[11], v[15]);    \
+                                         \
+  ARGON2_G(v[0], v[5], v[10], v[15]);    \
+  ARGON2_G(v[1], v[6], v[11], v[12]);    \
+  ARGON2_G(v[2], v[7], v[8], v[13]);     \
+  ARGON2_G(v[3], v[4], v[9], v[14]);     \
+}
+
+typedef struct argon2_block
+{
+  u64 values[128]; // 128 * 8 bytes = 1024 bytes per block
+
+} argon2_block_t;
+
+typedef struct argon2_options
+{
+  u32 type;    // presumably one of TYPE_D / TYPE_I / TYPE_ID - TODO confirm
+  u32 version; // presumably ARGON2_VERSION_10 or ARGON2_VERSION_13 - TODO confirm
+
+  u32 iterations;
+  u32 parallelism;         // argon2_final uses this as the lane count
+  u32 memory_usage_in_kib;
+
+  u32 segment_length;
+  u32 lane_length;         // argon2_final reads block (lane_length - 1) * parallelism + lane
+  u32 memory_block_count;
+  u32 digest_len;          // argon2_final emits digest_len / 4 u32 words
+
+} argon2_options_t;
+
+typedef struct argon2_pos
+{
+  u32 pass;  // m34000_loop starts this at LOOP_POS / ARGON2_SYNC_POINTS
+  u32 slice; // m34000_loop starts this at LOOP_POS % ARGON2_SYNC_POINTS and wraps it at ARGON2_SYNC_POINTS
+  u32 lane;  // m34000_loop iterates this from the dimension-1 local id
+
+} argon2_pos_t;
+
+DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, PRIVATE_AS const argon2_options_t *options, GLOBAL_AS argon2_block_t *out);
+DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf, const u32 argon2_thread, const u32 argon2_lsz);
+DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u32 *out);
+
+#endif // INC_HASH_ARGON2_H
diff --git a/OpenCL/inc_hash_blake2b.cl b/OpenCL/inc_hash_blake2b.cl
index 03a44b3ff..b205b18a7 100644
--- a/OpenCL/inc_hash_blake2b.cl
+++ b/OpenCL/inc_hash_blake2b.cl
@@ -409,7 +409,7 @@ DECLSPEC void blake2b_update (PRIVATE_AS blake2b_ctx_t *ctx, PRIVATE_AS const u3
   u32 w6[4];
   u32 w7[4];
 
-  const int limit = (const int) len - 128; // int type needed, could be negative
+  const int limit = len - 128; // int type needed, could be negative
 
   int pos1;
   int pos4;
@@ -499,7 +499,7 @@ DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS co
   u32 w6[4];
   u32 w7[4];
 
-  const int limit = (const int) len - 128; // int type needed, could be negative
+  const int limit = len - 128; // int type needed, could be negative
 
   int pos1;
   int pos4;
@@ -580,7 +580,7 @@ DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS co
 
 DECLSPEC void blake2b_final (PRIVATE_AS blake2b_ctx_t *ctx)
 {
-  blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_FINAL);
+  blake2b_transform (ctx->h, ctx->m, ctx->len, (u64) BLAKE2B_FINAL);
 }
 
 DECLSPEC void blake2b_transform_vector (PRIVATE_AS u64x *h, PRIVATE_AS const u64x *m, const u32x len, const u64 f0)
@@ -813,7 +813,7 @@ DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVA
   u32x w6[4];
   u32x w7[4];
 
-  const int limit = (const int) len - 128; // int type needed, could be negative
+  const int limit = len - 128; // int type needed, could be negative
 
   int pos1;
   int pos4;
@@ -894,5 +894,5 @@ DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVA
 
 DECLSPEC void blake2b_final_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx)
 {
-  blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_FINAL);
+  blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, (u64) BLAKE2B_FINAL);
 }
diff --git a/OpenCL/inc_hash_blake2s.cl b/OpenCL/inc_hash_blake2s.cl
index 77ecc02fb..99d4389fb 100644
--- a/OpenCL/inc_hash_blake2s.cl
+++ b/OpenCL/inc_hash_blake2s.cl
@@ -322,7 +322,7 @@ DECLSPEC void blake2s_update (PRIVATE_AS blake2s_ctx_t *ctx, PRIVATE_AS const u3
   u32 w2[4];
   u32 w3[4];
 
-  const int limit = (const int) len - 64; // int type needed, could be negative
+  const int limit = len - 64; // int type needed, could be negative
 
   int pos1;
   int pos4;
@@ -376,7 +376,7 @@ DECLSPEC void blake2s_update_global (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_AS co
   u32 w2[4];
   u32 w3[4];
 
-  const int limit = (const int) len - 64; // int type needed, could be negative
+  const int limit = len - 64; // int type needed, could be negative
 
   int pos1;
   int pos4;
@@ -516,7 +516,7 @@ DECLSPEC void blake2s_update_global_swap (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_
   u32 w2[4];
   u32 w3[4];
 
-  const int limit = (const int) len - 64; // int type needed, could be negative
+  const int limit = len - 64; // int type needed, could be negative
 
   int pos1;
   int pos4;
@@ -597,13 +597,11 @@ DECLSPEC void blake2s_update_global_swap (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_
   blake2s_update_64 (ctx, w0, w1, w2, w3, len - (u32) pos1);
 }
 
-
 DECLSPEC void blake2s_final (PRIVATE_AS blake2s_ctx_t *ctx)
 {
-  blake2s_transform (ctx->h, ctx->m, ctx->len, BLAKE2S_FINAL);
+  blake2s_transform (ctx->h, ctx->m, ctx->len, (u32) BLAKE2S_FINAL);
 }
 
-
 DECLSPEC void blake2s_hmac_init_64 (PRIVATE_AS blake2s_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, PRIVATE_AS const u32 *w2, PRIVATE_AS const u32 *w3)
 {
   u32 a0[4];
@@ -1158,7 +1156,7 @@ DECLSPEC void blake2s_update_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx, PRIVA
   u32x w2[4];
   u32x w3[4];
 
-  const int limit = (const int) len - 64; // int type needed, could be negative
+  const int limit = len - 64; // int type needed, could be negative
 
   int pos1;
   int pos4;
@@ -1207,7 +1205,7 @@ DECLSPEC void blake2s_update_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx, PRIVA
 
 DECLSPEC void blake2s_final_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx)
 {
-  blake2s_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2S_FINAL);
+  blake2s_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, (u32) BLAKE2S_FINAL);
 }
 
 DECLSPEC void blake2s_hmac_init_vector_64 (PRIVATE_AS blake2s_hmac_ctx_vector_t *ctx, PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3)
diff --git a/OpenCL/inc_markov.h b/OpenCL/inc_markov.h
index 3aae8f7fc..1ec187b8e 100644
--- a/OpenCL/inc_markov.h
+++ b/OpenCL/inc_markov.h
@@ -19,7 +19,7 @@
   CONSTANT_AS const u32  &bits14,         \
   CONSTANT_AS const u32  &bits15,         \
   CONSTANT_AS const u64  &gid_max,        \
-                    uint  hc_gid [[ thread_position_in_grid ]]
+                    uint3 hc_gid [[ thread_position_in_grid ]]
 
 #define KERN_ATTR_R_MARKOV                \
   GLOBAL_AS         bf_t *pws_buf_r,      \
@@ -31,7 +31,7 @@
   CONSTANT_AS const u32  &bits14,         \
   CONSTANT_AS const u32  &bits15,         \
   CONSTANT_AS const u64  &gid_max,        \
-                    uint  hc_gid [[ thread_position_in_grid ]]
+                    uint3 hc_gid [[ thread_position_in_grid ]]
 
 #define KERN_ATTR_C_MARKOV                \
   GLOBAL_AS         pw_t *pws_buf,        \
@@ -43,7 +43,7 @@
   CONSTANT_AS const u32  &bits14,         \
   CONSTANT_AS const u32  &bits15,         \
   CONSTANT_AS const u64  &gid_max,        \
-                    uint  hc_gid [[ thread_position_in_grid ]]
+                    uint3 hc_gid [[ thread_position_in_grid ]]
 
 #else // CUDA, HIP, OpenCL
 
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index 9af10e9ba..0a386f22e 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -104,9 +104,19 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
   return atomicOr (p, val);
 }
 
-DECLSPEC size_t get_group_id  (const u32 dimindx __attribute__((unused)))
+DECLSPEC size_t get_group_id (const u32 dimindx)
 {
-  return blockIdx.x;
+  switch (dimindx)
+  {
+    case 0:
+      return blockIdx.x;
+    case 1:
+      return blockIdx.y;
+    case 2:
+      return blockIdx.z;
+  }
+
+  return (size_t) -1;
 }
 
 DECLSPEC size_t get_global_id  (const u32 dimindx __attribute__((unused)))
@@ -114,15 +124,34 @@ DECLSPEC size_t get_global_id  (const u32 dimindx __attribute__((unused)))
   return (blockIdx.x * blockDim.x) + threadIdx.x;
 }
 
-DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused)))
+DECLSPEC size_t get_local_id (const u32 dimindx)
 {
-  return threadIdx.x;
+  switch (dimindx)
+  {
+    case 0:
+      return threadIdx.x;
+    case 1:
+      return threadIdx.y;
+    case 2:
+      return threadIdx.z;
+  }
+
+  return (size_t) -1;
 }
 
-DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused)))
+DECLSPEC size_t get_local_size (const u32 dimindx)
 {
-  // verify
-  return blockDim.x;
+  switch (dimindx)
+  {
+    case 0:
+      return blockDim.x;
+    case 1:
+      return blockDim.y;
+    case 2:
+      return blockDim.z;
+  }
+
+  return (size_t) -1;
 }
 
 DECLSPEC u32x rotl32 (const u32x a, const int n)
@@ -305,9 +334,19 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
   return atomicOr (p, val);
 }
 
-DECLSPEC size_t get_group_id  (const u32 dimindx __attribute__((unused)))
+DECLSPEC size_t get_group_id (const u32 dimindx)
 {
-  return blockIdx.x;
+  switch (dimindx)
+  {
+    case 0:
+      return blockIdx.x;
+    case 1:
+      return blockIdx.y;
+    case 2:
+      return blockIdx.z;
+  }
+
+  return (size_t) -1;
 }
 
 DECLSPEC size_t get_global_id  (const u32 dimindx __attribute__((unused)))
@@ -315,15 +354,34 @@ DECLSPEC size_t get_global_id  (const u32 dimindx __attribute__((unused)))
   return (blockIdx.x * blockDim.x) + threadIdx.x;
 }
 
-DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused)))
+DECLSPEC size_t get_local_id (const u32 dimindx)
 {
-  return threadIdx.x;
+  switch (dimindx)
+  {
+    case 0:
+      return threadIdx.x;
+    case 1:
+      return threadIdx.y;
+    case 2:
+      return threadIdx.z;
+  }
+
+  return (size_t) -1;
 }
 
-DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused)))
+DECLSPEC size_t get_local_size (const u32 dimindx)
 {
-  // verify
-  return blockDim.x;
+  switch (dimindx)
+  {
+    case 0:
+      return blockDim.x;
+    case 1:
+      return blockDim.y;
+    case 2:
+      return blockDim.z;
+  }
+
+  return (size_t) -1;
 }
 
 DECLSPEC u32x rotl32 (const u32x a, const int n)
diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h
index cc6489afe..9729d4fad 100644
--- a/OpenCL/inc_platform.h
+++ b/OpenCL/inc_platform.h
@@ -27,8 +27,9 @@ DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_or  (volatile GLOBAL_AS u32 *p, volatile const u32 val);
 
 DECLSPEC size_t get_global_id   (const u32 dimindx __attribute__((unused)));
-DECLSPEC size_t get_local_id    (const u32 dimindx __attribute__((unused)));
-DECLSPEC size_t get_local_size  (const u32 dimindx __attribute__((unused)));
+DECLSPEC size_t get_group_id    (const u32 dimindx);
+DECLSPEC size_t get_local_id    (const u32 dimindx);
+DECLSPEC size_t get_local_size  (const u32 dimindx);
 
 DECLSPEC u32x rotl32   (const u32x a, const int n);
 DECLSPEC u32x rotr32   (const u32x a, const int n);
@@ -48,7 +49,8 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_or  (volatile GLOBAL_AS u32 *p, volatile const u32 val);
 
-DECLSPEC size_t get_global_id   (const u32 dimindx);
+DECLSPEC size_t get_global_id   (const u32 dimindx __attribute__((unused)));
+DECLSPEC size_t get_group_id    (const u32 dimindx);
 DECLSPEC size_t get_local_id    (const u32 dimindx);
 DECLSPEC size_t get_local_size  (const u32 dimindx);
 
@@ -71,10 +73,25 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_or  (volatile GLOBAL_AS u32 *p, volatile const u32 val);
 
-#define get_global_id(param) hc_gid
-#define get_local_id(param) hc_lid
-#define get_local_size(param) hc_lsz
-#define get_group_id(param) hc_bid
+#define get_global_id(dimindx)   \
+  ((dimindx) == 0 ? hc_gid.x :   \
+   (dimindx) == 1 ? hc_gid.y :   \
+   (dimindx) == 2 ? hc_gid.z : -1)
+
+#define get_group_id(dimindx)    \
+  ((dimindx) == 0 ? hc_bid.x :   \
+   (dimindx) == 1 ? hc_bid.y :   \
+   (dimindx) == 2 ? hc_bid.z : -1)
+
+#define get_local_id(dimindx)    \
+  ((dimindx) == 0 ? hc_lid.x :   \
+   (dimindx) == 1 ? hc_lid.y :   \
+   (dimindx) == 2 ? hc_lid.z : -1)
+
+#define get_local_size(dimindx)  \
+  ((dimindx) == 0 ? hc_lsz.x :   \
+   (dimindx) == 1 ? hc_lsz.y :   \
+   (dimindx) == 2 ? hc_lsz.z : -1)
 
 DECLSPEC u32x rotl32   (const u32x a, const int n);
 DECLSPEC u32x rotr32   (const u32x a, const int n);
diff --git a/OpenCL/inc_shared.h b/OpenCL/inc_shared.h
index 16f2e2c4c..6518c30c6 100644
--- a/OpenCL/inc_shared.h
+++ b/OpenCL/inc_shared.h
@@ -13,28 +13,28 @@
   GLOBAL_AS         u32      *pws_comp, \
   GLOBAL_AS         pw_t     *pws_buf,  \
   CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]
 
 #define KERN_ATTR_GPU_MEMSET            \
   GLOBAL_AS         uint4    *buf,      \
   CONSTANT_AS const u32      &value,    \
   CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]
 
 #define KERN_ATTR_GPU_BZERO             \
   GLOBAL_AS         uint4    *buf,      \
   CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]
 
 #define KERN_ATTR_GPU_ATINIT            \
   GLOBAL_AS         pw_t     *buf,      \
   CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]
 
 #define KERN_ATTR_GPU_UTF8_TO_UTF16     \
   GLOBAL_AS         pw_t     *pws_buf,  \
   CONSTANT_AS const u64      &gid_max,  \
-                    uint      hc_gid [[ thread_position_in_grid ]]
+                    uint3     hc_gid [[ thread_position_in_grid ]]
 
 #else // CUDA, HIP, OpenCL
 
diff --git a/OpenCL/inc_types.h b/OpenCL/inc_types.h
index 233e28958..a13c89b8e 100644
--- a/OpenCL/inc_types.h
+++ b/OpenCL/inc_types.h
@@ -2008,6 +2008,7 @@ typedef struct salt
   u32 salt_len_pc;
   u32 salt_iter;
   u32 salt_iter2;
+  u32 salt_dimy;
   u32 salt_sign[2];
   u32 salt_repeats;
 
diff --git a/OpenCL/inc_vendor.h b/OpenCL/inc_vendor.h
index a52b4c899..40414cbb2 100644
--- a/OpenCL/inc_vendor.h
+++ b/OpenCL/inc_vendor.h
@@ -183,13 +183,11 @@ using namespace metal;
 #ifdef IS_CUDA
 #define USE_BITSELECT
 #define USE_ROTATE
-#define USE_FUNNELSHIFT
 #endif
 
 #ifdef IS_HIP
 #define USE_BITSELECT
 #define USE_ROTATE
-#define USE_FUNNELSHIFT
 #endif
 
 #ifdef IS_ROCM
@@ -220,7 +218,12 @@ using namespace metal;
 #define s3 w
 #endif
 
+#if HAS_SHFW == 1
+#define USE_FUNNELSHIFT
+#endif
+
 // some algorithms do not like this, eg 150, 1100, ...
+
 #ifdef NO_FUNNELSHIFT
 #undef USE_FUNNELSHIFT
 #endif
diff --git a/OpenCL/m15400_a3-optimized.cl b/OpenCL/m15400_a3-optimized.cl
index a0f856bac..f60635a1a 100644
--- a/OpenCL/m15400_a3-optimized.cl
+++ b/OpenCL/m15400_a3-optimized.cl
@@ -238,12 +238,162 @@ DECLSPEC void chacha20_transform (PRIVATE_AS const u32x *w0, PRIVATE_AS const u3
 
 KERNEL_FQ KERNEL_FA void m15400_m04 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
 {
-  // fixed size 32
+  /**
+   * modifier
+   */
+
+  const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 w0[4];
+  u32 w1[4];
+
+  w0[0] = pws[gid].i[0];
+  w0[1] = pws[gid].i[1];
+  w0[2] = pws[gid].i[2];
+  w0[3] = pws[gid].i[3];
+  w1[0] = pws[gid].i[4];
+  w1[1] = pws[gid].i[5];
+  w1[2] = pws[gid].i[6];
+  w1[3] = pws[gid].i[7];
+
+  /**
+   * Salt prep
+   */
+
+  u32 iv[2];
+
+  iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
+  iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
+
+  u32 plain[2];
+
+  plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
+  plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
+
+  u32 position[2];
+
+  position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
+  position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
+
+  u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
+
+  /**
+   * loop
+   */
+
+  u32 w0l = pws[gid].i[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+    const u32x w0x = w0l | w0r;
+
+    u32x w0_t[4];
+    u32x w1_t[4];
+
+    w0_t[0] = w0x;
+    w0_t[1] = w0[1];
+    w0_t[2] = w0[2];
+    w0_t[3] = w0[3];
+    w1_t[0] = w1[0];
+    w1_t[1] = w1[1];
+    w1_t[2] = w1[2];
+    w1_t[3] = w1[3];
+
+    u32x digest[4] = { 0 };
+
+    chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
+
+    const u32x r0 = digest[0];
+    const u32x r1 = digest[1];
+    const u32x r2 = digest[2];
+    const u32x r3 = digest[3];
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
 }
 
 KERNEL_FQ KERNEL_FA void m15400_m08 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
 {
-  // fixed size 32
+  /**
+   * modifier
+   */
+
+  const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 w0[4];
+  u32 w1[4];
+
+  w0[0] = pws[gid].i[0];
+  w0[1] = pws[gid].i[1];
+  w0[2] = pws[gid].i[2];
+  w0[3] = pws[gid].i[3];
+  w1[0] = pws[gid].i[4];
+  w1[1] = pws[gid].i[5];
+  w1[2] = pws[gid].i[6];
+  w1[3] = pws[gid].i[7];
+
+  /**
+   * Salt prep
+   */
+
+  u32 iv[2];
+
+  iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
+  iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
+
+  u32 plain[2];
+
+  plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
+  plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
+
+  u32 position[2];
+
+  position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
+  position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
+
+  u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
+
+  /**
+   * loop
+   */
+
+  u32 w0l = pws[gid].i[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+    const u32x w0x = w0l | w0r;
+
+    u32x w0_t[4];
+    u32x w1_t[4];
+
+    w0_t[0] = w0x;
+    w0_t[1] = w0[1];
+    w0_t[2] = w0[2];
+    w0_t[3] = w0[3];
+    w1_t[0] = w1[0];
+    w1_t[1] = w1[1];
+    w1_t[2] = w1[2];
+    w1_t[3] = w1[3];
+
+    u32x digest[4] = { 0 };
+
+    chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
+
+    const u32x r0 = digest[0];
+    const u32x r1 = digest[1];
+    const u32x r2 = digest[2];
+    const u32x r3 = digest[3];
+
+    COMPARE_M_SIMD (r0, r1, r2, r3);
+  }
 }
 
 KERNEL_FQ KERNEL_FA void m15400_m16 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
@@ -328,12 +478,187 @@ KERNEL_FQ KERNEL_FA void m15400_m16 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
 
 KERNEL_FQ KERNEL_FA void m15400_s04 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
 {
-  // fixed size 32
+  /**
+   * modifier
+   */
+
+  const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 w0[4];
+  u32 w1[4];
+
+  w0[0] = pws[gid].i[0];
+  w0[1] = pws[gid].i[1];
+  w0[2] = pws[gid].i[2];
+  w0[3] = pws[gid].i[3];
+  w1[0] = pws[gid].i[4];
+  w1[1] = pws[gid].i[5];
+  w1[2] = pws[gid].i[6];
+  w1[3] = pws[gid].i[7];
+
+  /**
+   * Salt prep
+   */
+
+  u32 iv[2];
+
+  iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
+  iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
+
+  u32 plain[2];
+
+  plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
+  plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
+
+  u32 position[2];
+
+  position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
+  position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
+
+  u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
+
+  /**
+   * digest
+   */
+
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * loop
+   */
+
+  u32 w0l = pws[gid].i[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+    const u32x w0x = w0l | w0r;
+
+    u32x w0_t[4];
+    u32x w1_t[4];
+
+    w0_t[0] = w0x;
+    w0_t[1] = w0[1];
+    w0_t[2] = w0[2];
+    w0_t[3] = w0[3];
+    w1_t[0] = w1[0];
+    w1_t[1] = w1[1];
+    w1_t[2] = w1[2];
+    w1_t[3] = w1[3];
+
+    u32x digest[4] = { 0 };
+
+    chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
+
+    const u32x r0 = digest[0];
+    const u32x r1 = digest[1];
+    const u32x r2 = digest[2];
+    const u32x r3 = digest[3];
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
 }
 
+
 KERNEL_FQ KERNEL_FA void m15400_s08 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
 {
-  // fixed size 32
+  /**
+   * modifier
+   */
+
+  const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  u32 w0[4];
+  u32 w1[4];
+
+  w0[0] = pws[gid].i[0];
+  w0[1] = pws[gid].i[1];
+  w0[2] = pws[gid].i[2];
+  w0[3] = pws[gid].i[3];
+  w1[0] = pws[gid].i[4];
+  w1[1] = pws[gid].i[5];
+  w1[2] = pws[gid].i[6];
+  w1[3] = pws[gid].i[7];
+
+  /**
+   * Salt prep
+   */
+
+  u32 iv[2];
+
+  iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
+  iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
+
+  u32 plain[2];
+
+  plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
+  plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
+
+  u32 position[2];
+
+  position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
+  position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
+
+  u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
+
+  /**
+   * digest
+   */
+
+  const u32 search[4] =
+  {
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
+    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
+  };
+
+  /**
+   * loop
+   */
+
+  u32 w0l = pws[gid].i[0];
+
+  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
+  {
+    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
+    const u32x w0x = w0l | w0r;
+
+    u32x w0_t[4];
+    u32x w1_t[4];
+
+    w0_t[0] = w0x;
+    w0_t[1] = w0[1];
+    w0_t[2] = w0[2];
+    w0_t[3] = w0[3];
+    w1_t[0] = w1[0];
+    w1_t[1] = w1[1];
+    w1_t[2] = w1[2];
+    w1_t[3] = w1[3];
+
+    u32x digest[4] = { 0 };
+
+    chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
+
+    const u32x r0 = digest[0];
+    const u32x r1 = digest[1];
+    const u32x r2 = digest[2];
+    const u32x r3 = digest[3];
+
+    COMPARE_S_SIMD (r0, r1, r2, r3);
+  }
 }
 
 KERNEL_FQ KERNEL_FA void m15400_s16 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
diff --git a/OpenCL/m18600-pure.cl b/OpenCL/m18600-pure.cl
index 54bba75d7..144df2063 100644
--- a/OpenCL/m18600-pure.cl
+++ b/OpenCL/m18600-pure.cl
@@ -636,7 +636,7 @@ KERNEL_FQ KERNEL_FA void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t
   }
 }
 
-KERNEL_FQ KERNEL_FA void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
+KERNEL_FQ KERNEL_FA FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) void m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
 {
   const u64 gid = get_global_id (0);
   const u64 lid = get_local_id (0);
diff --git a/OpenCL/m34000-pure.cl b/OpenCL/m34000-pure.cl
new file mode 100644
index 000000000..ba87d835c
--- /dev/null
+++ b/OpenCL/m34000-pure.cl
@@ -0,0 +1,152 @@
+
+/**
+ * Author......: Netherlands Forensic Institute
+ * License.....: MIT
+ */
+
+#ifdef KERNEL_STATIC
+#include M2S(INCLUDE_PATH/inc_platform.cl)
+#include M2S(INCLUDE_PATH/inc_common.cl)
+#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
+#include M2S(INCLUDE_PATH/inc_hash_argon2.cl)
+#endif
+
+#define COMPARE_S M2S(INCLUDE_PATH/inc_comp_single.cl)
+#define COMPARE_M M2S(INCLUDE_PATH/inc_comp_multi.cl)
+
+typedef struct argon2_tmp
+{
+  u32 state[4]; // just something for now; the kernels in this file never read or write it
+
+} argon2_tmp_t;
+
+typedef struct argon2_extra
+{
+#ifndef ARGON2_TMP_ELEM
+#define ARGON2_TMP_ELEM 1 // fallback when the build did not define ARGON2_TMP_ELEM
+#endif
+
+  argon2_block_t blocks[ARGON2_TMP_ELEM]; // block storage handed to argon2_init / argon2_fill_segment / argon2_final
+
+} argon2_extra_t;
+
+KERNEL_FQ KERNEL_FA void m34000_init (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t))
+{ // Runs argon2_init for password gid; the password's blocks live in one of the four extra buffers.
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  const u32 gd4 = gid / 4; // element index inside the selected extra buffer
+  const u32 gm4 = gid % 4; // selects which of d_extra0_buf..d_extra3_buf is used
+
+  GLOBAL_AS argon2_extra_t *V;
+
+  switch (gm4)
+  {
+    case 0: V = (GLOBAL_AS argon2_extra_t *) d_extra0_buf; break;
+    case 1: V = (GLOBAL_AS argon2_extra_t *) d_extra1_buf; break;
+    case 2: V = (GLOBAL_AS argon2_extra_t *) d_extra2_buf; break;
+    case 3: V = (GLOBAL_AS argon2_extra_t *) d_extra3_buf; break;
+  }
+
+  GLOBAL_AS argon2_extra_t *argon2_extra = V + gd4;
+
+  const argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST];
+
+  argon2_init (&pws[gid], &salt_bufs[SALT_POS_HOST], &options, argon2_extra->blocks);
+}
+
+KERNEL_FQ KERNEL_FA void m34000_loop (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t))
+{ // Fills LOOP_CNT consecutive slices, starting at the pass/slice derived from LOOP_POS, for the password selected by the group id.
+  const u64 gid = get_global_id (0); // not referenced below
+  const u64 bid = get_group_id (0);  // used as the password index in this kernel
+  const u64 lid = get_local_id (1);  // dimension 1: first lane handled by this thread
+  const u64 lsz = get_local_size (1); // dimension 1: lane stride
+
+  if (bid >= GID_CNT) return;
+
+  const u32 argon2_thread = get_local_id (0); // dimension 0: thread index passed to the shuffle helpers
+  const u32 argon2_lsz = get_local_size (0);
+
+  LOCAL_VK u64 shuffle_bufs[ARGON2_PARALLELISM][32]; // one 32-entry row per dimension-1 local id
+  LOCAL_AS u64 *shuffle_buf = shuffle_bufs[lid]; // NOTE(review): needs get_local_size (1) <= ARGON2_PARALLELISM - confirm host launch config
+
+  const u32 bd4 = bid / 4; // element index inside the selected extra buffer
+  const u32 bm4 = bid % 4; // selects which of d_extra0_buf..d_extra3_buf is used
+
+  GLOBAL_AS argon2_extra_t *V;
+
+  switch (bm4)
+  {
+    case 0: V = (GLOBAL_AS argon2_extra_t *) d_extra0_buf; break;
+    case 1: V = (GLOBAL_AS argon2_extra_t *) d_extra1_buf; break;
+    case 2: V = (GLOBAL_AS argon2_extra_t *) d_extra2_buf; break;
+    case 3: V = (GLOBAL_AS argon2_extra_t *) d_extra3_buf; break;
+  }
+
+  GLOBAL_AS argon2_extra_t *argon2_extra = V + bd4;
+
+  argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST];
+
+  options.parallelism = ARGON2_PARALLELISM; // replaces the value copied from the esalt
+
+  argon2_pos_t pos;
+
+  pos.pass   = (LOOP_POS / ARGON2_SYNC_POINTS);
+  pos.slice  = (LOOP_POS % ARGON2_SYNC_POINTS);
+
+  for (u32 i = 0; i < LOOP_CNT; i++) // one slice per iteration
+  {
+    for (pos.lane = lid; pos.lane < options.parallelism; pos.lane += lsz)
+    {
+      argon2_fill_segment (argon2_extra->blocks, &options, &pos, shuffle_buf, argon2_thread, argon2_lsz);
+    }
+
+    SYNC_THREADS (); // group-wide sync after the lanes of this slice, before the next slice starts
+
+    pos.slice++;
+
+    if (pos.slice == ARGON2_SYNC_POINTS) // wrap into the next pass
+    {
+      pos.slice = 0;
+      pos.pass++;
+    }
+  }
+}
+
+KERNEL_FQ KERNEL_FA void m34000_comp (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t))
+{ // Runs argon2_final for password gid and feeds the first four digest words to the COMPARE_M include.
+  const u64 gid = get_global_id (0);
+
+  if (gid >= GID_CNT) return;
+
+  const u32 gd4 = gid / 4; // element index inside the selected extra buffer
+  const u32 gm4 = gid % 4; // selects which of d_extra0_buf..d_extra3_buf is used
+
+  GLOBAL_AS argon2_extra_t *V;
+
+  switch (gm4)
+  {
+    case 0: V = (GLOBAL_AS argon2_extra_t *) d_extra0_buf; break;
+    case 1: V = (GLOBAL_AS argon2_extra_t *) d_extra1_buf; break;
+    case 2: V = (GLOBAL_AS argon2_extra_t *) d_extra2_buf; break;
+    case 3: V = (GLOBAL_AS argon2_extra_t *) d_extra3_buf; break;
+  }
+
+  GLOBAL_AS argon2_extra_t *argon2_extra = V + gd4;
+
+  u32 out[8]; // NOTE(review): argon2_final writes digest_len / 4 words here, so this assumes digest_len <= 32 - confirm in the host module
+
+  const argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST];
+
+  argon2_final (argon2_extra->blocks, &options, out);
+
+  const u32 r0 = out[0];
+  const u32 r1 = out[1];
+  const u32 r2 = out[2];
+  const u32 r3 = out[3];
+
+  #define il_pos 0 // presumably consumed by the COMPARE_M include below - TODO confirm
+
+  #include COMPARE_M
+}
diff --git a/Python/hcshared.py b/Python/hcshared.py
index 2fcf56c70..ad4390083 100644
--- a/Python/hcshared.py
+++ b/Python/hcshared.py
@@ -6,13 +6,14 @@ import sys
 
 def extract_salts(salts_buf) -> list:
   salts=[]
-  for salt_buf, salt_buf_pc, salt_len, salt_len_pc, salt_iter, salt_iter2, salt_sign, salt_repeats, orig_pos, digests_cnt, digests_done, digests_offset, scrypt_N, scrypt_r, scrypt_p in struct.iter_unpack("256s 256s I I I I 8s I I I I I I I I", salts_buf):
+  for salt_buf, salt_buf_pc, salt_len, salt_len_pc, salt_iter, salt_iter2, salt_dimy, salt_sign, salt_repeats, orig_pos, digests_cnt, digests_done, digests_offset, scrypt_N, scrypt_r, scrypt_p in struct.iter_unpack("256s 256s I I I I I 8s I I I I I I I I", salts_buf):
     salt_buf = salt_buf[0:salt_len]
     salt_buf_pc = salt_buf_pc[0:salt_len_pc]
     salts.append({ "salt_buf":      salt_buf,     \
                    "salt_buf_pc":   salt_buf_pc,  \
                    "salt_iter":     salt_iter,    \
                    "salt_iter2":    salt_iter2,   \
+                   "salt_dimy":     salt_dimy,    \
                    "salt_sign":     salt_sign,    \
                    "salt_repeats":  salt_repeats, \
                    "orig_pos":      orig_pos,     \
diff --git a/docs/changes.txt b/docs/changes.txt
index cc75fe622..4161e3b9a 100644
--- a/docs/changes.txt
+++ b/docs/changes.txt
@@ -70,6 +70,7 @@
 
 - Added new feature (-Y) that creates N virtual instances for each device in your system at the cost of N times the device memory consumption
 - Added options --benchmark-min and --benchmark-max to set a hash-mode range to be used during the benchmark
+- Added option --total-candidates to provide the total candidate count for an attack instead of the internal "--keyspace" value
 - Added option --backend-devices-keepfree to configure X percentage of device memory available to keep free
 
 ##
@@ -92,6 +93,8 @@
 - Fixed bug in grep out-of-memory workaround on Unit Test
 - Fixed bug in input_tokenizer when TOKEN_ATTR_FIXED_LENGTH is used and refactor modules
 - Fixed bug in --stdout that caused certain rules to malfunction
+- Fixed bug in Hardware Monitor: prevent disabling if ADL fails
+- Fixed race condition in selftest_init on OpenCL with non-blocking write
 - Fixed build failed for 10700 optimized with Apple Metal
 - Fixed build failed for 13772 and 13773 with Apple Metal
 - Fixed build failed for 18400 with Apple Metal
@@ -126,8 +129,11 @@
 - Alias Devices: Prevents hashcat, when started with x86_64 emulation on Apple Silicon, from showing the Apple M1 OpenCL CPU as an alias for the Apple M1 Metal GPU
 - Apple Driver: Automatically enable GPU support on Apple OpenCL instead of CPU support
 - Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use
+- Backend: Added workaround to get rid of internal runtimes memory leaks
 - Backend: Updated filename chksum format to prevent invalid cache on Apple Silicon when switching arch
+- Backend: Splitting backend_ctx_devices_init into smaller runtime-specific functions
 - Backend Checks: Describe workaround in error message when detecting more than 64 backend devices
+- Backend Info: Added --machine-readable format
 - Brain: Added sanity check and corresponding error message for invalid --brain-port values
 - Dependencies: Added sse2neon v1.8.0 (commit 658eeac)
 - Dependencies: Updated LZMA SDK to 24.09
@@ -135,26 +141,36 @@
 - Dependencies: Updated xxHash to 0.8.3 (commit 50f4226)
 - Building: Support building windows binaries on macOS using MinGW
 - Dependencies: Updated OpenCL-Headers to v2024.10.24 (commit 265df85)
+- Documents: Renamed status_codes.txt to exit_status_code.txt and added device_status_code.txt
 - Documents: Updated BUILD.md and added BUILD_macOS.md (containing instructions for building windows binaries on macOS)
+- HIP Backend: Avoid deprecated functions
 - Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py`
 - Modules: Updated module_unstable_warning
 - Open Document Format: Added support for small documents with content length < 1024
 - OpenCL Backend: added workaround to set device_available_memory from CUDA/HIP alias device
+- Selftest: renamed selftest function to process_selftest and split it into 3 smaller functions
 - Status Code: Add specific return code for self-test fail (-11)
 - Scrypt: Increase buffer sizes in module for hash mode 8900 to allow longer scrypt digests
 - Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629
+- Unit tests: Updated install_modules.sh with Crypt::Argon2
 - User Options: Added error message when mixing --username and --show to warn users of exponential delay
 - MetaMask: update extraction tool to support MetaMask Mobile wallets
 - SecureCRT MasterPassphrase v2: update module, pure kernels and test unit. Add optimized kernels.
 - Metal Backend: added workaround to prevent 'Infinite Loop' bug when build kernels
 - Metal Backend: added workaround to set the true Processor value in Metal devices on Apple Intel
+- Metal Backend: added support for 2D/3D Compute
 - Metal Backend: allow use of devices with Metal if runtime version is >= 200
 - Metal Backend: disable Metal devices only if at least one OpenCL device is active
+- Metal Backend: improved compute workloads calculation
 - Modules: Check UnpackSize to raise false positive with hc_decompress_rar
 - User Options: added --metal-compiler-runtime option
+- User Options: assigned -H to --hash-info
+- Hash-Info: show more details using -HH
 - Hardware Monitor: avoid sprintf in src/ext_iokit.c
+- Hardware Monitor: Splitting hwmon_ctx_init function into smaller library-specific functions
 - Help: show supported hash-modes only with -hh
 - Makefile: prevent make failure with Apple Silicon in case of partial rebuild
+- Makefile: updated MACOSX_DEPLOYMENT_TARGET to 15.0
 - Rules: Rename best64.rule to best66.rule and remove the unknown section from it
 
 * changes v6.2.5 -> v6.2.6
diff --git a/docs/device_status_code.txt b/docs/device_status_code.txt
new file mode 100644
index 000000000..bd1607409
--- /dev/null
+++ b/docs/device_status_code.txt
@@ -0,0 +1,20 @@
+Device Status Codes:
+====================
+
+ 0 = "Initializing"
+ 1 = "Autotuning"
+ 2 = "Selftest"
+ 3 = "Running"
+ 4 = "Paused"
+ 5 = "Exhausted"
+ 6 = "Cracked"
+ 7 = "Aborted"
+ 8 = "Quit"
+ 9 = "Bypass"
+10 = "Aborted (Checkpoint)"
+11 = "Aborted (Runtime)"
+12 = "Running (Checkpoint Quit requested)"
+13 = "Error"
+14 = "Aborted (Finish)"
+15 = "Running (Quit after attack requested)"
+16 = "Autodetect"
diff --git a/docs/status_codes.txt b/docs/exit_status_code.txt
similarity index 100%
rename from docs/status_codes.txt
rename to docs/exit_status_code.txt
diff --git a/docs/hashcat-python-plugin-development-guide.md b/docs/hashcat-python-plugin-development-guide.md
index 308b6d918..b93f2d66c 100644
--- a/docs/hashcat-python-plugin-development-guide.md
+++ b/docs/hashcat-python-plugin-development-guide.md
@@ -210,13 +210,13 @@ Notes:
 
 If you modify one of these plugin files, there's a trade-off: you won’t be able to contribute that code directly to the upstream Hashcat repository, since those files are meant to remain clean for demonstration purposes.
 
-To address this, the assimilation bridge provides a generic parameter that users can specify via the command line. In the case of the Python bridge, only the first parameter is used. You can override the Python script to be loaded using `--bridge-parameter1`:
+To address this, the assimilation bridge provides a generic parameter that users can specify via the command line. In the case of the Python bridge, only the first parameter is used. Using `--bridge-parameter1` allows you to override the Python script to be loaded:
 
 ```
-$ ./hashcat -m 73000 --bridge-parameter1 myimplementation.py hash.txt wordlist.txt ...
+$ ./hashcat -m 73000 --bridge-parameter1 ./Python/myimplementation.py hash.txt wordlist.txt ...
 ```
 
-This tells the Python bridge plugin to load `myimplementation.py` instead of the default `generic_hash_mp.py`. This approach is especially useful if you plan to contribute `myimplementation.py` to the upstream Hashcat repository. If you choose to stay within the generic mode, your Python code won’t have a dedicated hash mode, and you'll need to instruct users to use the `--bridge-parameter1` flag to load your implementation.
+This tells the Python bridge plugin to load `myimplementation.py` located in the local `Python` subdirectory instead of the default `generic_hash_mp.py`. This approach is especially useful if you plan to contribute `myimplementation.py` to the upstream Hashcat repository. If you choose to stay within the generic mode, your Python code won’t have a dedicated hash mode, and you'll need to instruct users to use the `--bridge-parameter1` flag to load your implementation.
 
 ### Design Tradeoffs and Format Considerations
 
diff --git a/extra/tab_completion/hashcat.sh b/extra/tab_completion/hashcat.sh
index f1d01e0f5..c415bcb48 100755
--- a/extra/tab_completion/hashcat.sh
+++ b/extra/tab_completion/hashcat.sh
@@ -425,7 +425,7 @@ _hashcat ()
   local HIDDEN_FILES_AGGRESSIVE="${HIDDEN_FILES}|hcmask|hcchr"
   local BUILD_IN_CHARSETS='?l ?u ?d ?a ?b ?s ?h ?H'
 
-  local SHORT_OPTS="-m -a -V -h -b -t -T -o -p -c -d -D -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z -M -Y -R"
+  local SHORT_OPTS="-m -a -V -h -H -b -t -T -o -p -c -d -D -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z -M -Y -R"
   local LONG_OPTS="--hash-type --attack-mode --version --help --quiet --benchmark --benchmark-all --hex-salt --hex-wordlist --hex-charset --force --status --status-json --status-timer --stdin-timeout-abort --machine-readable --loopback --markov-hcstat2 --markov-disable --markov-inverse --markov-classic --markov-threshold --runtime --session --speed-only --progress-only --restore --restore-file-path --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-json --outfile-check-timer --outfile-check-dir --wordlist-autohex-disable --separator --show --deprecated-check-disable --left --username --dynamic-x --remove --remove-timer --potfile-disable --potfile-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --example-hashes --hash-info --backend-ignore-cuda --backend-ignore-opencl --backend-ignore-hip --backend-ignore-metal --backend-info --backend-devices --backend-devices-virtmulti --backend-devices-virthost --backend-devices-keepfree --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-disable --hwmon-temp-abort --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-func-sel --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --stdout --keep-guessing --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --optimized-kernel-enable --multiply-accel-disable --self-test-disable --slow-candidates --brain-server --brain-server-timer --brain-client --brain-client-features --brain-host --brain-port --brain-session --brain-session-whitelist --brain-password --identify 
--bridge-parameter1 --bridge-parameter2 --bridge-parameter3 --bridge-parameter4"
   local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --status-timer --stdin-timeout-abort --markov-hcstat2 --markov-threshold --runtime --session --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --potfile-path --restore-file-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --backend-devices --backend-devices-virtmulti --backend-devices-virthost --backend-devices-keepfree --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-temp-abort --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-func-sel --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment-min --increment-max --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --brain-server-timer --brain-client-features --brain-host --brain-password --brain-port --brain-session --brain-session-whitelist --bridge-parameter1 --bridge-parameter2 --bridge-parameter3 --bridge-parameter4"
 
@@ -729,11 +729,11 @@ _hashcat ()
 
   while [ ${h} -le ${COMP_CWORD} ]; do
 
-    if   [[ "${COMP_WORDS[h]}" == "-a" ]]; then
+    if [[ "${COMP_WORDS[h]}" == "-a" ]]; then
 
       attack_mode=${COMP_WORDS[$((h + 1))]}
 
-    elif   [[ "${COMP_WORDS[h]}" == -a* ]]; then
+    elif [[ "${COMP_WORDS[h]}" == -a* ]]; then
 
       attack_mode=${COMP_WORDS[h]:2}
 
diff --git a/include/ext_OpenCL.h b/include/ext_OpenCL.h
index d49f82c79..7044ac667 100644
--- a/include/ext_OpenCL.h
+++ b/include/ext_OpenCL.h
@@ -51,7 +51,7 @@ typedef cl_int           (CL_API_CALL *OCL_CLENQUEUEFILLBUFFER)       (cl_comman
 typedef cl_int           (CL_API_CALL *OCL_CLENQUEUECOPYBUFFER)       (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *);
 typedef void *           (CL_API_CALL *OCL_CLENQUEUEMAPBUFFER)        (cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
 typedef cl_int           (CL_API_CALL *OCL_CLENQUEUENDRANGEKERNEL)    (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
-typedef cl_int           (CL_API_CALL *OCL_CLENQUEUEREADBUFFER)       (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int           (CL_API_CALL *OCL_CLENQUEUEREADBUFFER)       (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
 typedef cl_int           (CL_API_CALL *OCL_CLENQUEUEUNMAPMEMOBJECT)   (cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
 typedef cl_int           (CL_API_CALL *OCL_CLENQUEUEWRITEBUFFER)      (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
 typedef cl_int           (CL_API_CALL *OCL_CLFINISH)                  (cl_command_queue);
diff --git a/include/ext_cuda.h b/include/ext_cuda.h
index 44ae01cd6..a80af5367 100644
--- a/include/ext_cuda.h
+++ b/include/ext_cuda.h
@@ -1154,14 +1154,19 @@ typedef CUresult (CUDA_API_CALL *CUDA_CUINIT)                   (unsigned int);
 typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL)           (CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC)               (CUdeviceptr *, size_t);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST)           (void **, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD)             (CUdeviceptr, CUdeviceptr, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH)             (void *, CUdeviceptr, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD)             (CUdeviceptr, const void *, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32)              (CUdeviceptr, unsigned int, size_t);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8)               (CUdeviceptr, unsigned char, size_t);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTODASYNC)        (CUdeviceptr, CUdeviceptr, size_t, CUstream);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOHASYNC)        (void *, CUdeviceptr, size_t, CUstream);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTODASYNC)        (CUdeviceptr, const void *, size_t, CUstream);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32ASYNC)         (CUdeviceptr, unsigned int, size_t, CUstream);
+typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8ASYNC)          (CUdeviceptr, unsigned char, size_t, CUstream);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE)                (CUdeviceptr);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST)            (void *);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO)             (size_t *, size_t *);
-typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32ASYNC)         (CUdeviceptr, unsigned int, size_t, CUstream);
-typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8ASYNC)          (CUdeviceptr, unsigned char, size_t, CUstream);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETFUNCTION)      (CUfunction *, CUmodule, const char *);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETGLOBAL)        (CUdeviceptr *, size_t *, CUmodule, const char *);
 typedef CUresult (CUDA_API_CALL *CUDA_CUMODULELOAD)             (CUmodule *, const char *);
@@ -1217,14 +1222,19 @@ typedef struct hc_cuda_lib
   CUDA_CULAUNCHKERNEL           cuLaunchKernel;
   CUDA_CUMEMALLOC               cuMemAlloc;
   CUDA_CUMEMALLOCHOST           cuMemAllocHost;
+  CUDA_CUMEMCPYDTOD             cuMemcpyDtoD;
+  CUDA_CUMEMCPYDTOH             cuMemcpyDtoH;
+  CUDA_CUMEMCPYHTOD             cuMemcpyHtoD;
+  CUDA_CUMEMSETD32              cuMemsetD32;
+  CUDA_CUMEMSETD8               cuMemsetD8;
   CUDA_CUMEMCPYDTODASYNC        cuMemcpyDtoDAsync;
   CUDA_CUMEMCPYDTOHASYNC        cuMemcpyDtoHAsync;
   CUDA_CUMEMCPYHTODASYNC        cuMemcpyHtoDAsync;
+  CUDA_CUMEMSETD32ASYNC         cuMemsetD32Async;
+  CUDA_CUMEMSETD8ASYNC          cuMemsetD8Async;
   CUDA_CUMEMFREE                cuMemFree;
   CUDA_CUMEMFREEHOST            cuMemFreeHost;
   CUDA_CUMEMGETINFO             cuMemGetInfo;
-  CUDA_CUMEMSETD32ASYNC         cuMemsetD32Async;
-  CUDA_CUMEMSETD8ASYNC          cuMemsetD8Async;
   CUDA_CUMODULEGETFUNCTION      cuModuleGetFunction;
   CUDA_CUMODULEGETGLOBAL        cuModuleGetGlobal;
   CUDA_CUMODULELOAD             cuModuleLoad;
@@ -1272,13 +1282,18 @@ int hc_cuFuncSetAttribute      (void *hashcat_ctx, CUfunction hfunc, CUfunction_
 int hc_cuInit                  (void *hashcat_ctx, unsigned int Flags);
 int hc_cuLaunchKernel          (void *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
 int hc_cuMemAlloc              (void *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
+int hc_cuMemcpyDtoD            (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+int hc_cuMemcpyDtoH            (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+int hc_cuMemcpyHtoD            (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+int hc_cuMemsetD32             (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N);
+int hc_cuMemsetD8              (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N);
 int hc_cuMemcpyDtoDAsync       (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
 int hc_cuMemcpyDtoHAsync       (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
 int hc_cuMemcpyHtoDAsync       (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-int hc_cuMemFree               (void *hashcat_ctx, CUdeviceptr dptr);
-int hc_cuMemGetInfo            (void *hashcat_ctx, size_t *free, size_t *total);
 int hc_cuMemsetD32Async        (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
 int hc_cuMemsetD8Async         (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+int hc_cuMemFree               (void *hashcat_ctx, CUdeviceptr dptr);
+int hc_cuMemGetInfo            (void *hashcat_ctx, size_t *free, size_t *total);
 int hc_cuModuleGetFunction     (void *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
 int hc_cuModuleGetGlobal       (void *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
 int hc_cuModuleLoadDataEx      (void *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
diff --git a/include/ext_hip.h b/include/ext_hip.h
index 4cc6fb797..3c0b8433a 100644
--- a/include/ext_hip.h
+++ b/include/ext_hip.h
@@ -12,20 +12,22 @@
 
 // start: driver_types.h
 
-typedef void* hipDeviceptr_t;
+typedef void *hipDeviceptr_t;
+
+typedef enum hipFunction_attribute
+{
+  HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,            // The maximum number of threads per block. Depends on function and device.
+  HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,                // The statically allocated shared memory size in bytes per block required by the function.
+  HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,                 // The user-allocated constant memory by the function in bytes.
+  HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,                 // The local memory usage of each thread by this function in bytes.
+  HIP_FUNC_ATTRIBUTE_NUM_REGS,                         // The number of registers used by each thread of this function.
+  HIP_FUNC_ATTRIBUTE_PTX_VERSION,                      // PTX version
+  HIP_FUNC_ATTRIBUTE_BINARY_VERSION,                   // Binary version
+  HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,                    // Cache mode
+  HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,    // The maximum dynamic shared memory per block for this function in bytes.
+  HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, // The shared memory carveout preference in percent of the maximum shared memory.
+  HIP_FUNC_ATTRIBUTE_MAX
 
-typedef enum hipFunction_attribute {
-    HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,            ///< The maximum number of threads per block. Depends on function and device.
-    HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,                ///< The statically allocated shared memory size in bytes per block required by the function.
-    HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,                 ///< The user-allocated constant memory by the function in bytes.
-    HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,                 ///< The local memory usage of each thread by this function in bytes.
-    HIP_FUNC_ATTRIBUTE_NUM_REGS,                         ///< The number of registers used by each thread of this function.
-    HIP_FUNC_ATTRIBUTE_PTX_VERSION,                      ///< PTX version
-    HIP_FUNC_ATTRIBUTE_BINARY_VERSION,                   ///< Binary version
-    HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,                    ///< Cache mode
-    HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,    ///< The maximum dynamic shared memory per block for this function in bytes.
-    HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, ///< The shared memory carveout preference in percent of the maximum shared memory.
-    HIP_FUNC_ATTRIBUTE_MAX
 } hipFunction_attribute;
 
 // stop: driver_types.h
@@ -47,120 +49,102 @@ typedef struct ihipModuleSymbol_t* hipFunction_t;
 #define __HIP_NODISCARD
 #endif
 
-typedef enum __HIP_NODISCARD hipError_t {
-    hipSuccess = 0,  ///< Successful completion.
-    hipErrorInvalidValue = 1,  ///< One or more of the parameters passed to the API call is NULL
-                               ///< or not in an acceptable range.
-    hipErrorOutOfMemory = 2,
-    // Deprecated
-    hipErrorMemoryAllocation = 2,  ///< Memory allocation error.
-    hipErrorNotInitialized = 3,
-    // Deprecated
-    hipErrorInitializationError = 3,
-    hipErrorDeinitialized = 4,
-    hipErrorProfilerDisabled = 5,
-    hipErrorProfilerNotInitialized = 6,
-    hipErrorProfilerAlreadyStarted = 7,
-    hipErrorProfilerAlreadyStopped = 8,
-    hipErrorInvalidConfiguration = 9,
-    hipErrorInvalidPitchValue = 12,
-    hipErrorInvalidSymbol = 13,
-    hipErrorInvalidDevicePointer = 17,  ///< Invalid Device Pointer
-    hipErrorInvalidMemcpyDirection = 21,  ///< Invalid memory copy direction
-    hipErrorInsufficientDriver = 35,
-    hipErrorMissingConfiguration = 52,
-    hipErrorPriorLaunchFailure = 53,
-    hipErrorInvalidDeviceFunction = 98,
-    hipErrorNoDevice = 100,  ///< Call to hipGetDeviceCount returned 0 devices
-    hipErrorInvalidDevice = 101,  ///< DeviceID must be in range 0...#compute-devices.
-    hipErrorInvalidImage = 200,
-    hipErrorInvalidContext = 201,  ///< Produced when input context is invalid.
-    hipErrorContextAlreadyCurrent = 202,
-    hipErrorMapFailed = 205,
-    // Deprecated
-    hipErrorMapBufferObjectFailed = 205,  ///< Produced when the IPC memory attach failed from ROCr.
-    hipErrorUnmapFailed = 206,
-    hipErrorArrayIsMapped = 207,
-    hipErrorAlreadyMapped = 208,
-    hipErrorNoBinaryForGpu = 209,
-    hipErrorAlreadyAcquired = 210,
-    hipErrorNotMapped = 211,
-    hipErrorNotMappedAsArray = 212,
-    hipErrorNotMappedAsPointer = 213,
-    hipErrorECCNotCorrectable = 214,
-    hipErrorUnsupportedLimit = 215,
-    hipErrorContextAlreadyInUse = 216,
-    hipErrorPeerAccessUnsupported = 217,
-    hipErrorInvalidKernelFile = 218,  ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
-    hipErrorInvalidGraphicsContext = 219,
-    hipErrorInvalidSource = 300,
-    hipErrorFileNotFound = 301,
-    hipErrorSharedObjectSymbolNotFound = 302,
-    hipErrorSharedObjectInitFailed = 303,
-    hipErrorOperatingSystem = 304,
-    hipErrorInvalidHandle = 400,
-    // Deprecated
-    hipErrorInvalidResourceHandle = 400,  ///< Resource handle (hipEvent_t or hipStream_t) invalid.
-    hipErrorIllegalState = 401, ///< Resource required is not in a valid state to perform operation.
-    hipErrorNotFound = 500,
-    hipErrorNotReady = 600,  ///< Indicates that asynchronous operations enqueued earlier are not
-                             ///< ready.  This is not actually an error, but is used to distinguish
-                             ///< from hipSuccess (which indicates completion).  APIs that return
-                             ///< this error include hipEventQuery and hipStreamQuery.
-    hipErrorIllegalAddress = 700,
-    hipErrorLaunchOutOfResources = 701,  ///< Out of resources error.
-    hipErrorLaunchTimeOut = 702,
-    hipErrorPeerAccessAlreadyEnabled =
-        704,  ///< Peer access was already enabled from the current device.
-    hipErrorPeerAccessNotEnabled =
-        705,  ///< Peer access was never enabled from the current device.
-    hipErrorSetOnActiveProcess = 708,
-    hipErrorContextIsDestroyed = 709,
-    hipErrorAssert = 710,  ///< Produced when the kernel calls assert.
-    hipErrorHostMemoryAlreadyRegistered =
-        712,  ///< Produced when trying to lock a page-locked memory.
-    hipErrorHostMemoryNotRegistered =
-        713,  ///< Produced when trying to unlock a non-page-locked memory.
-    hipErrorLaunchFailure =
-        719,  ///< An exception occurred on the device while executing a kernel.
-    hipErrorCooperativeLaunchTooLarge =
-        720,  ///< This error indicates that the number of blocks launched per grid for a kernel
-              ///< that was launched via cooperative launch APIs exceeds the maximum number of
-              ///< allowed blocks for the current device
-    hipErrorNotSupported = 801,  ///< Produced when the hip API is not supported/implemented
-    hipErrorStreamCaptureUnsupported = 900,  ///< The operation is not permitted when the stream
-                                             ///< is capturing.
-    hipErrorStreamCaptureInvalidated = 901,  ///< The current capture sequence on the stream
-                                             ///< has been invalidated due to a previous error.
-    hipErrorStreamCaptureMerge = 902,  ///< The operation would have resulted in a merge of
-                                       ///< two independent capture sequences.
-    hipErrorStreamCaptureUnmatched = 903,  ///< The capture was not initiated in this stream.
-    hipErrorStreamCaptureUnjoined = 904,  ///< The capture sequence contains a fork that was not
-                                          ///< joined to the primary stream.
-    hipErrorStreamCaptureIsolation = 905,  ///< A dependency would have been created which crosses
-                                           ///< the capture sequence boundary. Only implicit
-                                           ///< in-stream ordering dependencies  are allowed
-                                           ///< to cross the boundary
-    hipErrorStreamCaptureImplicit = 906,  ///< The operation would have resulted in a disallowed
-                                          ///< implicit dependency on a current capture sequence
-                                          ///< from hipStreamLegacy.
-    hipErrorCapturedEvent = 907,  ///< The operation is not permitted on an event which was last
-                                  ///< recorded in a capturing stream.
-    hipErrorStreamCaptureWrongThread = 908,  ///< A stream capture sequence not initiated with
-                                             ///< the hipStreamCaptureModeRelaxed argument to
-                                             ///< hipStreamBeginCapture was passed to
-                                             ///< hipStreamEndCapture in a different thread.
-    hipErrorGraphExecUpdateFailure = 910,  ///< This error indicates that the graph update
-                                           ///< not performed because it included changes which
-                                           ///< violated constraints specific to instantiated graph
-                                           ///< update.
-    hipErrorUnknown = 999,  //< Unknown error.
-    // HSA Runtime Error Codes start here.
-    hipErrorRuntimeMemory = 1052,  ///< HSA runtime memory call returned error.  Typically not seen
-                                   ///< in production systems.
-    hipErrorRuntimeOther = 1053,  ///< HSA runtime call other than memory returned error.  Typically
-                                  ///< not seen in production systems.
-    hipErrorTbd  ///< Marker that more error codes are needed.
+typedef enum __HIP_NODISCARD hipError_t
+{
+  hipSuccess = 0,                            // Successful completion.
+  hipErrorInvalidValue = 1,                  // One or more of the parameters passed to the API call is NULL
+                                             // or not in an acceptable range.
+  hipErrorOutOfMemory = 2,
+  // Deprecated
+  hipErrorMemoryAllocation = 2,              // Memory allocation error.
+  hipErrorNotInitialized = 3,
+  // Deprecated
+  hipErrorInitializationError = 3,
+  hipErrorDeinitialized = 4,
+  hipErrorProfilerDisabled = 5,
+  hipErrorProfilerNotInitialized = 6,
+  hipErrorProfilerAlreadyStarted = 7,
+  hipErrorProfilerAlreadyStopped = 8,
+  hipErrorInvalidConfiguration = 9,
+  hipErrorInvalidPitchValue = 12,
+  hipErrorInvalidSymbol = 13,
+  hipErrorInvalidDevicePointer = 17,         // Invalid Device Pointer
+  hipErrorInvalidMemcpyDirection = 21,       // Invalid memory copy direction
+  hipErrorInsufficientDriver = 35,
+  hipErrorMissingConfiguration = 52,
+  hipErrorPriorLaunchFailure = 53,
+  hipErrorInvalidDeviceFunction = 98,
+  hipErrorNoDevice = 100,                    // Call to hipGetDeviceCount returned 0 devices
+  hipErrorInvalidDevice = 101,               // DeviceID must be in range 0...#compute-devices.
+  hipErrorInvalidImage = 200,
+  hipErrorInvalidContext = 201,              // Produced when input context is invalid.
+  hipErrorContextAlreadyCurrent = 202,
+  hipErrorMapFailed = 205,
+  // Deprecated
+  hipErrorMapBufferObjectFailed = 205,       // Produced when the IPC memory attach failed from ROCr.
+  hipErrorUnmapFailed = 206,
+  hipErrorArrayIsMapped = 207,
+  hipErrorAlreadyMapped = 208,
+  hipErrorNoBinaryForGpu = 209,
+  hipErrorAlreadyAcquired = 210,
+  hipErrorNotMapped = 211,
+  hipErrorNotMappedAsArray = 212,
+  hipErrorNotMappedAsPointer = 213,
+  hipErrorECCNotCorrectable = 214,
+  hipErrorUnsupportedLimit = 215,
+  hipErrorContextAlreadyInUse = 216,
+  hipErrorPeerAccessUnsupported = 217,
+  hipErrorInvalidKernelFile = 218,           // In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
+  hipErrorInvalidGraphicsContext = 219,
+  hipErrorInvalidSource = 300,
+  hipErrorFileNotFound = 301,
+  hipErrorSharedObjectSymbolNotFound = 302,
+  hipErrorSharedObjectInitFailed = 303,
+  hipErrorOperatingSystem = 304,
+  hipErrorInvalidHandle = 400,
+  // Deprecated
+  hipErrorInvalidResourceHandle = 400,       // Resource handle (hipEvent_t or hipStream_t) invalid.
+  hipErrorIllegalState = 401,                // Resource required is not in a valid state to perform operation.
+  hipErrorNotFound = 500,
+  hipErrorNotReady = 600,                    // Indicates that asynchronous operations enqueued earlier are not ready.
+                                             // This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion).
+                                             // APIs that return this error include hipEventQuery and hipStreamQuery.
+  hipErrorIllegalAddress = 700,
+  hipErrorLaunchOutOfResources = 701,        // Out of resources error.
+  hipErrorLaunchTimeOut = 702,
+  hipErrorPeerAccessAlreadyEnabled = 704,    // Peer access was already enabled from the current device.
+  hipErrorPeerAccessNotEnabled = 705,        // Peer access was never enabled from the current device.
+  hipErrorSetOnActiveProcess = 708,
+  hipErrorContextIsDestroyed = 709,
+  hipErrorAssert = 710,                      // Produced when the kernel calls assert.
+  hipErrorHostMemoryAlreadyRegistered = 712, // Produced when trying to lock a page-locked memory.
+  hipErrorHostMemoryNotRegistered = 713,     // Produced when trying to unlock a non-page-locked memory.
+  hipErrorLaunchFailure = 719,               // An exception occurred on the device while executing a kernel.
+  hipErrorCooperativeLaunchTooLarge = 720,   // This error indicates that the number of blocks launched per grid for a kernel
+                                             // that was launched via cooperative launch APIs exceeds the maximum number of
+                                             // allowed blocks for the current device
+  hipErrorNotSupported = 801,                // Produced when the hip API is not supported/implemented
+  hipErrorStreamCaptureUnsupported = 900,    // The operation is not permitted when the stream is capturing.
+  hipErrorStreamCaptureInvalidated = 901,    // The current capture sequence on the stream
+                                             // has been invalidated due to a previous error.
+  hipErrorStreamCaptureMerge = 902,          // The operation would have resulted in a merge of two independent capture sequences.
+  hipErrorStreamCaptureUnmatched = 903,      // The capture was not initiated in this stream.
+  hipErrorStreamCaptureUnjoined = 904,       // The capture sequence contains a fork that was not joined to the primary stream.
+  hipErrorStreamCaptureIsolation = 905,      // A dependency would have been created which crosses the capture sequence boundary.
+                                             // Only implicit in-stream ordering dependencies are allowed to cross the boundary
+  hipErrorStreamCaptureImplicit = 906,       // The operation would have resulted in a disallowed implicit dependency on a current
+                                             // capture sequence from hipStreamLegacy.
+  hipErrorCapturedEvent = 907,               // The operation is not permitted on an event which was last recorded in a capturing stream.
+  hipErrorStreamCaptureWrongThread = 908,    // A stream capture sequence not initiated with the hipStreamCaptureModeRelaxed argument to
+                                             // hipStreamBeginCapture was passed to hipStreamEndCapture in a different thread.
+  hipErrorGraphExecUpdateFailure = 910,      // This error indicates that the graph update not performed because it included changes which
+                                             // violated constraints specific to instantiated graph update.
+  hipErrorUnknown = 999,                     // Unknown error.
+  // HSA Runtime Error Codes start here.
+  hipErrorRuntimeMemory = 1052,              // HSA runtime memory call returned error. Typically not seen in production systems.
+  hipErrorRuntimeOther = 1053,               // HSA runtime call other than memory returned error. Typically not seen in production systems.
+  hipErrorTbd                                // Marker that more error codes are needed.
+
 } hipError_t;
 
 #undef __HIP_NODISCARD
@@ -170,178 +154,178 @@ typedef enum __HIP_NODISCARD hipError_t {
  * hipDeviceAttribute_t
  * hipDeviceAttributeUnused number: 5
  */
-typedef enum hipDeviceAttribute_t {
-    hipDeviceAttributeCudaCompatibleBegin = 0,
+typedef enum hipDeviceAttribute_t
+{
+  hipDeviceAttributeCudaCompatibleBegin = 0,
 
-    hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled.
-    hipDeviceAttributeAccessPolicyMaxWindowSize,        ///< Cuda only. The maximum size of the window policy in bytes.
-    hipDeviceAttributeAsyncEngineCount,                 ///< Asynchronous engines number.
-    hipDeviceAttributeCanMapHostMemory,                 ///< Whether host memory can be mapped into device address space
-    hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Device can access host registered memory
-                                                        ///< at the same virtual address as the CPU
-    hipDeviceAttributeClockRate,                        ///< Peak clock frequency in kilohertz.
-    hipDeviceAttributeComputeMode,                      ///< Compute mode that device is currently in.
-    hipDeviceAttributeComputePreemptionSupported,       ///< Device supports Compute Preemption.
-    hipDeviceAttributeConcurrentKernels,                ///< Device can possibly execute multiple kernels concurrently.
-    hipDeviceAttributeConcurrentManagedAccess,          ///< Device can coherently access managed memory concurrently with the CPU
-    hipDeviceAttributeCooperativeLaunch,                ///< Support cooperative launch
-    hipDeviceAttributeCooperativeMultiDeviceLaunch,     ///< Support cooperative launch on multiple devices
-    hipDeviceAttributeDeviceOverlap,                    ///< Device can concurrently copy memory and execute a kernel.
-                                                        ///< Deprecated. Use instead asyncEngineCount.
-    hipDeviceAttributeDirectManagedMemAccessFromHost,   ///< Host can directly access managed memory on
-                                                        ///< the device without migration
-    hipDeviceAttributeGlobalL1CacheSupported,           ///< Device supports caching globals in L1
-    hipDeviceAttributeHostNativeAtomicSupported,        ///< Link between the device and the host supports native atomic operations
-    hipDeviceAttributeIntegrated,                       ///< Device is integrated GPU
-    hipDeviceAttributeIsMultiGpuBoard,                  ///< Multiple GPU devices.
-    hipDeviceAttributeKernelExecTimeout,                ///< Run time limit for kernels executed on the device
-    hipDeviceAttributeL2CacheSize,                      ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
-    hipDeviceAttributeLocalL1CacheSupported,            ///< caching locals in L1 is supported
-    hipDeviceAttributeLuid,                             ///< 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms
-    hipDeviceAttributeLuidDeviceNodeMask,               ///< Luid device node mask. Undefined on TCC and non-Windows platforms
-    hipDeviceAttributeComputeCapabilityMajor,           ///< Major compute capability version number.
-    hipDeviceAttributeManagedMemory,                    ///< Device supports allocating managed memory on this system
-    hipDeviceAttributeMaxBlocksPerMultiProcessor,       ///< Max block size per multiprocessor
-    hipDeviceAttributeMaxBlockDimX,                     ///< Max block size in width.
-    hipDeviceAttributeMaxBlockDimY,                     ///< Max block size in height.
-    hipDeviceAttributeMaxBlockDimZ,                     ///< Max block size in depth.
-    hipDeviceAttributeMaxGridDimX,                      ///< Max grid size  in width.
-    hipDeviceAttributeMaxGridDimY,                      ///< Max grid size  in height.
-    hipDeviceAttributeMaxGridDimZ,                      ///< Max grid size  in depth.
-    hipDeviceAttributeMaxSurface1D,                     ///< Maximum size of 1D surface.
-    hipDeviceAttributeMaxSurface1DLayered,              ///< Cuda only. Maximum dimensions of 1D layered surface.
-    hipDeviceAttributeMaxSurface2D,                     ///< Maximum dimension (width, height) of 2D surface.
-    hipDeviceAttributeMaxSurface2DLayered,              ///< Cuda only. Maximum dimensions of 2D layered surface.
-    hipDeviceAttributeMaxSurface3D,                     ///< Maximum dimension (width, height, depth) of 3D surface.
-    hipDeviceAttributeMaxSurfaceCubemap,                ///< Cuda only. Maximum dimensions of Cubemap surface.
-    hipDeviceAttributeMaxSurfaceCubemapLayered,         ///< Cuda only. Maximum dimension of Cubemap layered surface.
-    hipDeviceAttributeMaxTexture1DWidth,                ///< Maximum size of 1D texture.
-    hipDeviceAttributeMaxTexture1DLayered,              ///< Maximum dimensions of 1D layered texture.
-    hipDeviceAttributeMaxTexture1DLinear,               ///< Maximum number of elements allocatable in a 1D linear texture.
-                                                        ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda.
-    hipDeviceAttributeMaxTexture1DMipmap,               ///< Maximum size of 1D mipmapped texture.
-    hipDeviceAttributeMaxTexture2DWidth,                ///< Maximum dimension width of 2D texture.
-    hipDeviceAttributeMaxTexture2DHeight,               ///< Maximum dimension hight of 2D texture.
-    hipDeviceAttributeMaxTexture2DGather,               ///< Maximum dimensions of 2D texture if gather operations  performed.
-    hipDeviceAttributeMaxTexture2DLayered,              ///< Maximum dimensions of 2D layered texture.
-    hipDeviceAttributeMaxTexture2DLinear,               ///< Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory.
-    hipDeviceAttributeMaxTexture2DMipmap,               ///< Maximum dimensions of 2D mipmapped texture.
-    hipDeviceAttributeMaxTexture3DWidth,                ///< Maximum dimension width of 3D texture.
-    hipDeviceAttributeMaxTexture3DHeight,               ///< Maximum dimension height of 3D texture.
-    hipDeviceAttributeMaxTexture3DDepth,                ///< Maximum dimension depth of 3D texture.
-    hipDeviceAttributeMaxTexture3DAlt,                  ///< Maximum dimensions of alternate 3D texture.
-    hipDeviceAttributeMaxTextureCubemap,                ///< Maximum dimensions of Cubemap texture
-    hipDeviceAttributeMaxTextureCubemapLayered,         ///< Maximum dimensions of Cubemap layered texture.
-    hipDeviceAttributeMaxThreadsDim,                    ///< Maximum dimension of a block
-    hipDeviceAttributeMaxThreadsPerBlock,               ///< Maximum number of threads per block.
-    hipDeviceAttributeMaxThreadsPerMultiProcessor,      ///< Maximum resident threads per multiprocessor.
-    hipDeviceAttributeMaxPitch,                         ///< Maximum pitch in bytes allowed by memory copies
-    hipDeviceAttributeMemoryBusWidth,                   ///< Global memory bus width in bits.
-    hipDeviceAttributeMemoryClockRate,                  ///< Peak memory clock frequency in kilohertz.
-    hipDeviceAttributeComputeCapabilityMinor,           ///< Minor compute capability version number.
-    hipDeviceAttributeMultiGpuBoardGroupID,             ///< Unique ID of device group on the same multi-GPU board
-    hipDeviceAttributeMultiprocessorCount,              ///< Number of multiprocessors on the device.
-    hipDeviceAttributeUnused1,                          ///< Previously hipDeviceAttributeName
-    hipDeviceAttributePageableMemoryAccess,             ///< Device supports coherently accessing pageable memory
-                                                        ///< without calling hipHostRegister on it
-    hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables
-    hipDeviceAttributePciBusId,                         ///< PCI Bus ID.
-    hipDeviceAttributePciDeviceId,                      ///< PCI Device ID.
-    hipDeviceAttributePciDomainID,                      ///< PCI Domain ID.
-    hipDeviceAttributePersistingL2CacheMaxSize,         ///< Maximum l2 persisting lines capacity in bytes
-    hipDeviceAttributeMaxRegistersPerBlock,             ///< 32-bit registers available to a thread block. This number is shared
-                                                        ///< by all thread blocks simultaneously resident on a multiprocessor.
-    hipDeviceAttributeMaxRegistersPerMultiprocessor,    ///< 32-bit registers available per block.
-    hipDeviceAttributeReservedSharedMemPerBlock,        ///< Shared memory reserved by CUDA driver per block.
-    hipDeviceAttributeMaxSharedMemoryPerBlock,          ///< Maximum shared memory available per block in bytes.
-    hipDeviceAttributeSharedMemPerBlockOptin,           ///< Maximum shared memory per block usable by special opt in.
-    hipDeviceAttributeSharedMemPerMultiprocessor,       ///< Shared memory available per multiprocessor.
-    hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision.
-    hipDeviceAttributeStreamPrioritiesSupported,        ///< Whether to support stream priorities.
-    hipDeviceAttributeSurfaceAlignment,                 ///< Alignment requirement for surfaces
-    hipDeviceAttributeTccDriver,                        ///< Cuda only. Whether device is a Tesla device using TCC driver
-    hipDeviceAttributeTextureAlignment,                 ///< Alignment requirement for textures
-    hipDeviceAttributeTexturePitchAlignment,            ///< Pitch alignment requirement for 2D texture references bound to pitched memory;
-    hipDeviceAttributeTotalConstantMemory,              ///< Constant memory size in bytes.
-    hipDeviceAttributeTotalGlobalMem,                   ///< Global memory available on devicice.
-    hipDeviceAttributeUnifiedAddressing,                ///< Cuda only. An unified address space shared with the host.
-    hipDeviceAttributeUnused2,                          ///< Previously hipDeviceAttributeUuid
-    hipDeviceAttributeWarpSize,                         ///< Warp size in threads.
-    hipDeviceAttributeMemoryPoolsSupported,             ///< Device supports HIP Stream Ordered Memory Allocator
-    hipDeviceAttributeVirtualMemoryManagementSupported, ///< Device supports HIP virtual memory management
-    hipDeviceAttributeHostRegisterSupported,            ///< Can device support host memory registration via hipHostRegister
-    hipDeviceAttributeMemoryPoolSupportedHandleTypes,   ///< Supported handle mask for HIP Stream Ordered Memory Allocator
+  hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, // Whether ECC support is enabled.
+  hipDeviceAttributeAccessPolicyMaxWindowSize,                // Cuda only. The maximum size of the window policy in bytes.
+  hipDeviceAttributeAsyncEngineCount,                         // Asynchronous engines number.
+  hipDeviceAttributeCanMapHostMemory,                         // Whether host memory can be mapped into device address space
+  hipDeviceAttributeCanUseHostPointerForRegisteredMem,        // Device can access host registered memory
+                                                              // at the same virtual address as the CPU
+  hipDeviceAttributeClockRate,                                // Peak clock frequency in kilohertz.
+  hipDeviceAttributeComputeMode,                              // Compute mode that device is currently in.
+  hipDeviceAttributeComputePreemptionSupported,               // Device supports Compute Preemption.
+  hipDeviceAttributeConcurrentKernels,                        // Device can possibly execute multiple kernels concurrently.
+  hipDeviceAttributeConcurrentManagedAccess,                  // Device can coherently access managed memory concurrently with the CPU
+  hipDeviceAttributeCooperativeLaunch,                        // Support cooperative launch
+  hipDeviceAttributeCooperativeMultiDeviceLaunch,             // Support cooperative launch on multiple devices
+  hipDeviceAttributeDeviceOverlap,                            // Device can concurrently copy memory and execute a kernel.
+                                                              // Deprecated. Use instead asyncEngineCount.
+  hipDeviceAttributeDirectManagedMemAccessFromHost,           // Host can directly access managed memory on
+                                                              // the device without migration
+  hipDeviceAttributeGlobalL1CacheSupported,                   // Device supports caching globals in L1
+  hipDeviceAttributeHostNativeAtomicSupported,                // Link between the device and the host supports native atomic operations
+  hipDeviceAttributeIntegrated,                               // Device is integrated GPU
+  hipDeviceAttributeIsMultiGpuBoard,                          // Multiple GPU devices.
+  hipDeviceAttributeKernelExecTimeout,                        // Run time limit for kernels executed on the device
+  hipDeviceAttributeL2CacheSize,                              // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
+  hipDeviceAttributeLocalL1CacheSupported,                    // caching locals in L1 is supported
+  hipDeviceAttributeLuid,                                     // 8-byte locally unique identifier. Undefined on TCC and non-Windows platforms
+  hipDeviceAttributeLuidDeviceNodeMask,                       // Luid device node mask. Undefined on TCC and non-Windows platforms
+  hipDeviceAttributeComputeCapabilityMajor,                   // Major compute capability version number.
+  hipDeviceAttributeManagedMemory,                            // Device supports allocating managed memory on this system
+  hipDeviceAttributeMaxBlocksPerMultiProcessor,               // Max number of blocks per multiprocessor
+  hipDeviceAttributeMaxBlockDimX,                             // Max block size in width.
+  hipDeviceAttributeMaxBlockDimY,                             // Max block size in height.
+  hipDeviceAttributeMaxBlockDimZ,                             // Max block size in depth.
+  hipDeviceAttributeMaxGridDimX,                              // Max grid size  in width.
+  hipDeviceAttributeMaxGridDimY,                              // Max grid size  in height.
+  hipDeviceAttributeMaxGridDimZ,                              // Max grid size  in depth.
+  hipDeviceAttributeMaxSurface1D,                             // Maximum size of 1D surface.
+  hipDeviceAttributeMaxSurface1DLayered,                      // Cuda only. Maximum dimensions of 1D layered surface.
+  hipDeviceAttributeMaxSurface2D,                             // Maximum dimension (width, height) of 2D surface.
+  hipDeviceAttributeMaxSurface2DLayered,                      // Cuda only. Maximum dimensions of 2D layered surface.
+  hipDeviceAttributeMaxSurface3D,                             // Maximum dimension (width, height, depth) of 3D surface.
+  hipDeviceAttributeMaxSurfaceCubemap,                        // Cuda only. Maximum dimensions of Cubemap surface.
+  hipDeviceAttributeMaxSurfaceCubemapLayered,                 // Cuda only. Maximum dimension of Cubemap layered surface.
+  hipDeviceAttributeMaxTexture1DWidth,                        // Maximum size of 1D texture.
+  hipDeviceAttributeMaxTexture1DLayered,                      // Maximum dimensions of 1D layered texture.
+  hipDeviceAttributeMaxTexture1DLinear,                       // Maximum number of elements allocatable in a 1D linear texture.
+                                                              // Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda.
+  hipDeviceAttributeMaxTexture1DMipmap,                       // Maximum size of 1D mipmapped texture.
+  hipDeviceAttributeMaxTexture2DWidth,                        // Maximum dimension width of 2D texture.
+  hipDeviceAttributeMaxTexture2DHeight,                       // Maximum dimension height of 2D texture.
+  hipDeviceAttributeMaxTexture2DGather,                       // Maximum dimensions of 2D texture if gather operations are performed.
+  hipDeviceAttributeMaxTexture2DLayered,                      // Maximum dimensions of 2D layered texture.
+  hipDeviceAttributeMaxTexture2DLinear,                       // Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory.
+  hipDeviceAttributeMaxTexture2DMipmap,                       // Maximum dimensions of 2D mipmapped texture.
+  hipDeviceAttributeMaxTexture3DWidth,                        // Maximum dimension width of 3D texture.
+  hipDeviceAttributeMaxTexture3DHeight,                       // Maximum dimension height of 3D texture.
+  hipDeviceAttributeMaxTexture3DDepth,                        // Maximum dimension depth of 3D texture.
+  hipDeviceAttributeMaxTexture3DAlt,                          // Maximum dimensions of alternate 3D texture.
+  hipDeviceAttributeMaxTextureCubemap,                        // Maximum dimensions of Cubemap texture
+  hipDeviceAttributeMaxTextureCubemapLayered,                 // Maximum dimensions of Cubemap layered texture.
+  hipDeviceAttributeMaxThreadsDim,                            // Maximum dimension of a block
+  hipDeviceAttributeMaxThreadsPerBlock,                       // Maximum number of threads per block.
+  hipDeviceAttributeMaxThreadsPerMultiProcessor,              // Maximum resident threads per multiprocessor.
+  hipDeviceAttributeMaxPitch,                                 // Maximum pitch in bytes allowed by memory copies
+  hipDeviceAttributeMemoryBusWidth,                           // Global memory bus width in bits.
+  hipDeviceAttributeMemoryClockRate,                          // Peak memory clock frequency in kilohertz.
+  hipDeviceAttributeComputeCapabilityMinor,                   // Minor compute capability version number.
+  hipDeviceAttributeMultiGpuBoardGroupID,                     // Unique ID of device group on the same multi-GPU board
+  hipDeviceAttributeMultiprocessorCount,                      // Number of multiprocessors on the device.
+  hipDeviceAttributeUnused1,                                  // Previously hipDeviceAttributeName
+  hipDeviceAttributePageableMemoryAccess,                     // Device supports coherently accessing pageable memory
+                                                              // without calling hipHostRegister on it
+  hipDeviceAttributePageableMemoryAccessUsesHostPageTables,   // Device accesses pageable memory via the host's page tables
+  hipDeviceAttributePciBusId,                                 // PCI Bus ID.
+  hipDeviceAttributePciDeviceId,                              // PCI Device ID.
+  hipDeviceAttributePciDomainID,                              // PCI Domain ID.
+  hipDeviceAttributePersistingL2CacheMaxSize,                 // Maximum l2 persisting lines capacity in bytes
+  hipDeviceAttributeMaxRegistersPerBlock,                     // 32-bit registers available to a thread block. This number is shared
+                                                              // by all thread blocks simultaneously resident on a multiprocessor.
+  hipDeviceAttributeMaxRegistersPerMultiprocessor,            // 32-bit registers available per multiprocessor.
+  hipDeviceAttributeReservedSharedMemPerBlock,                // Shared memory reserved by CUDA driver per block.
+  hipDeviceAttributeMaxSharedMemoryPerBlock,                  // Maximum shared memory available per block in bytes.
+  hipDeviceAttributeSharedMemPerBlockOptin,                   // Maximum shared memory per block usable by special opt in.
+  hipDeviceAttributeSharedMemPerMultiprocessor,               // Shared memory available per multiprocessor.
+  hipDeviceAttributeSingleToDoublePrecisionPerfRatio,         // Cuda only. Performance ratio of single precision to double precision.
+  hipDeviceAttributeStreamPrioritiesSupported,                // Whether to support stream priorities.
+  hipDeviceAttributeSurfaceAlignment,                         // Alignment requirement for surfaces
+  hipDeviceAttributeTccDriver,                                // Cuda only. Whether device is a Tesla device using TCC driver
+  hipDeviceAttributeTextureAlignment,                         // Alignment requirement for textures
+  hipDeviceAttributeTexturePitchAlignment,                    // Pitch alignment requirement for 2D texture references bound to pitched memory.
+  hipDeviceAttributeTotalConstantMemory,                      // Constant memory size in bytes.
+  hipDeviceAttributeTotalGlobalMem,                           // Global memory available on device.
+  hipDeviceAttributeUnifiedAddressing,                        // Cuda only. A unified address space shared with the host.
+  hipDeviceAttributeUnused2,                                  // Previously hipDeviceAttributeUuid
+  hipDeviceAttributeWarpSize,                                 // Warp size in threads.
+  hipDeviceAttributeMemoryPoolsSupported,                     // Device supports HIP Stream Ordered Memory Allocator
+  hipDeviceAttributeVirtualMemoryManagementSupported,         // Device supports HIP virtual memory management
+  hipDeviceAttributeHostRegisterSupported,                    // Can device support host memory registration via hipHostRegister
+  hipDeviceAttributeMemoryPoolSupportedHandleTypes,           // Supported handle mask for HIP Stream Ordered Memory Allocator
 
-    hipDeviceAttributeCudaCompatibleEnd = 9999,
-    hipDeviceAttributeAmdSpecificBegin = 10000,
+  hipDeviceAttributeCudaCompatibleEnd = 9999,
+  hipDeviceAttributeAmdSpecificBegin = 10000,
 
-    hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin,  ///< Frequency in khz of the timer used by the device-side "clock*"
-    hipDeviceAttributeUnused3,                                  ///< Previously hipDeviceAttributeArch
-    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,         ///< Maximum Shared Memory PerMultiprocessor.
-    hipDeviceAttributeUnused4,                                  ///< Previously hipDeviceAttributeGcnArch
-    hipDeviceAttributeUnused5,                                  ///< Previously hipDeviceAttributeGcnArchName
-    hipDeviceAttributeHdpMemFlushCntl,                          ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
-    hipDeviceAttributeHdpRegFlushCntl,                          ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,      ///< Supports cooperative launch on multiple
-                                                                ///< devices with unmatched functions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,   ///< Supports cooperative launch on multiple
-                                                                ///< devices with unmatched grid dimensions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,  ///< Supports cooperative launch on multiple
-                                                                ///< devices with unmatched block dimensions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple
-                                                                ///< devices with unmatched shared memories
-    hipDeviceAttributeIsLargeBar,                               ///< Whether it is LargeBar
-    hipDeviceAttributeAsicRevision,                             ///< Revision of the GPU in this device
-    hipDeviceAttributeCanUseStreamWaitValue,                    ///< '1' if Device supports hipStreamWaitValue32() and
-                                                                ///< hipStreamWaitValue64(), '0' otherwise.
-    hipDeviceAttributeImageSupport,                             ///< '1' if Device supports image, '0' otherwise.
-    hipDeviceAttributePhysicalMultiProcessorCount,              ///< All available physical compute
-                                                                ///< units for the device
-    hipDeviceAttributeFineGrainSupport,                         ///< '1' if Device supports fine grain, '0' otherwise
-    hipDeviceAttributeWallClockRate,                            ///< Constant frequency of wall clock in kilohertz.
+  hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin,  // Frequency in khz of the timer used by the device-side "clock*" instructions.
+  hipDeviceAttributeUnused3,                                  // Previously hipDeviceAttributeArch
+  hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,         // Maximum Shared Memory PerMultiprocessor.
+  hipDeviceAttributeUnused4,                                  // Previously hipDeviceAttributeGcnArch
+  hipDeviceAttributeUnused5,                                  // Previously hipDeviceAttributeGcnArchName
+  hipDeviceAttributeHdpMemFlushCntl,                          // Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
+  hipDeviceAttributeHdpRegFlushCntl,                          // Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,      // Supports cooperative launch on multiple devices with unmatched functions
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,   // Supports cooperative launch on multiple devices with unmatched grid dimensions
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,  // Supports cooperative launch on multiple devices with unmatched block dimensions
+  hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, // Supports cooperative launch on multiple devices with unmatched shared memories
+  hipDeviceAttributeIsLargeBar,                               // Whether it is LargeBar
+  hipDeviceAttributeAsicRevision,                             // Revision of the GPU in this device
+  hipDeviceAttributeCanUseStreamWaitValue,                    // '1' if Device supports hipStreamWaitValue32() and hipStreamWaitValue64(), '0' otherwise.
+  hipDeviceAttributeImageSupport,                             // '1' if Device supports image, '0' otherwise.
+  hipDeviceAttributePhysicalMultiProcessorCount,              // All available physical compute units for the device
+  hipDeviceAttributeFineGrainSupport,                         // '1' if Device supports fine grain, '0' otherwise
+  hipDeviceAttributeWallClockRate,                            // Constant frequency of wall clock in kilohertz.
+
+  hipDeviceAttributeAmdSpecificEnd = 19999,
+  hipDeviceAttributeVendorSpecificBegin = 20000,
+  // Extended attributes for vendors
 
-    hipDeviceAttributeAmdSpecificEnd = 19999,
-    hipDeviceAttributeVendorSpecificBegin = 20000,
-    // Extended attributes for vendors
 } hipDeviceAttribute_t;
 
 /**
  * hipDeviceArch_t
  *
  */
-typedef struct {
-    // 32-bit Atomics
-    unsigned hasGlobalInt32Atomics : 1;     ///< 32-bit integer atomics for global memory.
-    unsigned hasGlobalFloatAtomicExch : 1;  ///< 32-bit float atomic exch for global memory.
-    unsigned hasSharedInt32Atomics : 1;     ///< 32-bit integer atomics for shared memory.
-    unsigned hasSharedFloatAtomicExch : 1;  ///< 32-bit float atomic exch for shared memory.
-    unsigned hasFloatAtomicAdd : 1;  ///< 32-bit float atomic add in global and shared memory.
+typedef struct
+{
+  // 32-bit Atomics
+  unsigned hasGlobalInt32Atomics : 1;     // 32-bit integer atomics for global memory.
+  unsigned hasGlobalFloatAtomicExch : 1;  // 32-bit float atomic exch for global memory.
+  unsigned hasSharedInt32Atomics : 1;     // 32-bit integer atomics for shared memory.
+  unsigned hasSharedFloatAtomicExch : 1;  // 32-bit float atomic exch for shared memory.
+  unsigned hasFloatAtomicAdd : 1;         // 32-bit float atomic add in global and shared memory.
 
-    // 64-bit Atomics
-    unsigned hasGlobalInt64Atomics : 1;  ///< 64-bit integer atomics for global memory.
-    unsigned hasSharedInt64Atomics : 1;  ///< 64-bit integer atomics for shared memory.
+  // 64-bit Atomics
+  unsigned hasGlobalInt64Atomics : 1;     // 64-bit integer atomics for global memory.
+  unsigned hasSharedInt64Atomics : 1;     // 64-bit integer atomics for shared memory.
 
-    // Doubles
-    unsigned hasDoubles : 1;  ///< Double-precision floating point.
+  // Doubles
+  unsigned hasDoubles : 1;                // Double-precision floating point.
 
-    // Warp cross-lane operations
-    unsigned hasWarpVote : 1;     ///< Warp vote instructions (__any, __all).
-    unsigned hasWarpBallot : 1;   ///< Warp ballot instructions (__ballot).
-    unsigned hasWarpShuffle : 1;  ///< Warp shuffle operations. (__shfl_*).
-    unsigned hasFunnelShift : 1;  ///< Funnel two words into one with shift&mask caps.
+  // Warp cross-lane operations
+  unsigned hasWarpVote : 1;               // Warp vote instructions (__any, __all).
+  unsigned hasWarpBallot : 1;             // Warp ballot instructions (__ballot).
+  unsigned hasWarpShuffle : 1;            // Warp shuffle operations. (__shfl_*).
+  unsigned hasFunnelShift : 1;            // Funnel two words into one with shift&mask caps.
 
-    // Sync
-    unsigned hasThreadFenceSystem : 1;  ///< __threadfence_system.
-    unsigned hasSyncThreadsExt : 1;     ///< __syncthreads_count, syncthreads_and, syncthreads_or.
+  // Sync
+  unsigned hasThreadFenceSystem : 1;      // __threadfence_system.
+  unsigned hasSyncThreadsExt : 1;         // __syncthreads_count, __syncthreads_and, __syncthreads_or.
+
+  // Misc
+  unsigned hasSurfaceFuncs : 1;           // Surface functions.
+  unsigned has3dGrid : 1;                 // Grid and group dims are 3D (rather than 2D).
+  unsigned hasDynamicParallelism : 1;     // Dynamic parallelism.
 
-    // Misc
-    unsigned hasSurfaceFuncs : 1;        ///< Surface functions.
-    unsigned has3dGrid : 1;              ///< Grid and group dims are 3D (rather than 2D).
-    unsigned hasDynamicParallelism : 1;  ///< Dynamic parallelism.
 } hipDeviceArch_t;
 
-typedef struct hipUUID_t {
-    char bytes[16];
+typedef struct hipUUID_t
+{
+  char bytes[16];
+
 } hipUUID;
 
 
@@ -349,149 +333,127 @@ typedef struct hipUUID_t {
  * hipDeviceProp
  *
  */
-typedef struct hipDeviceProp_t {
-    char name[256];                   ///< Device name.
-    hipUUID uuid;                     ///< UUID of a device
-    char luid[8];                     ///< 8-byte unique identifier. Only valid on windows
-    unsigned int luidDeviceNodeMask;  ///< LUID node mask
-    size_t totalGlobalMem;            ///< Size of global memory region (in bytes).
-    size_t sharedMemPerBlock;         ///< Size of shared memory per block (in bytes).
-    int regsPerBlock;                 ///< Registers per block.
-    int warpSize;                     ///< Warp size.
-    size_t memPitch;                  ///< Maximum pitch in bytes allowed by memory copies
-                                      ///< pitched memory
-    int maxThreadsPerBlock;           ///< Max work items per work group or workgroup max size.
-    int maxThreadsDim[3];             ///< Max number of threads in each dimension (XYZ) of a block.
-    int maxGridSize[3];               ///< Max grid dimensions (XYZ).
-    int clockRate;                    ///< Max clock frequency of the multiProcessors in khz.
-    size_t totalConstMem;             ///< Size of shared constant memory region on the device
-                                      ///< (in bytes).
-    int major;  ///< Major compute capability.  On HCC, this is an approximation and features may
-                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
-                ///< feature caps.
-    int minor;  ///< Minor compute capability.  On HCC, this is an approximation and features may
-                ///< differ from CUDA CC.  See the arch feature flags for portable ways to query
-                ///< feature caps.
-    size_t textureAlignment;       ///< Alignment requirement for textures
-    size_t texturePitchAlignment;  ///< Pitch alignment requirement for texture references bound to
-    int deviceOverlap;             ///< Deprecated. Use asyncEngineCount instead
-    int multiProcessorCount;       ///< Number of multi-processors (compute units).
-    int kernelExecTimeoutEnabled;  ///< Run time limit for kernels executed on the device
-    int integrated;                ///< APU vs dGPU
-    int canMapHostMemory;          ///< Check whether HIP can map host memory
-    int computeMode;               ///< Compute mode.
-    int maxTexture1D;              ///< Maximum number of elements in 1D images
-    int maxTexture1DMipmap;        ///< Maximum 1D mipmap texture size
-    int maxTexture1DLinear;        ///< Maximum size for 1D textures bound to linear memory
-    int maxTexture2D[2];  ///< Maximum dimensions (width, height) of 2D images, in image elements
-    int maxTexture2DMipmap[2];  ///< Maximum number of elements in 2D array mipmap of images
-    int maxTexture2DLinear[3];  ///< Maximum 2D tex dimensions if tex are bound to pitched memory
-    int maxTexture2DGather[2];  ///< Maximum 2D tex dimensions if gather has to be performed
-    int maxTexture3D[3];  ///< Maximum dimensions (width, height, depth) of 3D images, in image
-                          ///< elements
-    int maxTexture3DAlt[3];           ///< Maximum alternate 3D texture dims
-    int maxTextureCubemap;            ///< Maximum cubemap texture dims
-    int maxTexture1DLayered[2];       ///< Maximum number of elements in 1D array images
-    int maxTexture2DLayered[3];       ///< Maximum number of elements in 2D array images
-    int maxTextureCubemapLayered[2];  ///< Maximum cubemaps layered texture dims
-    int maxSurface1D;                 ///< Maximum 1D surface size
-    int maxSurface2D[2];              ///< Maximum 2D surface size
-    int maxSurface3D[3];              ///< Maximum 3D surface size
-    int maxSurface1DLayered[2];       ///< Maximum 1D layered surface size
-    int maxSurface2DLayered[3];       ///< Maximum 2D layared surface size
-    int maxSurfaceCubemap;            ///< Maximum cubemap surface size
-    int maxSurfaceCubemapLayered[2];  ///< Maximum cubemap layered surface size
-    size_t surfaceAlignment;          ///< Alignment requirement for surface
-    int concurrentKernels;         ///< Device can possibly execute multiple kernels concurrently.
-    int ECCEnabled;                ///< Device has ECC support enabled
-    int pciBusID;                  ///< PCI Bus ID.
-    int pciDeviceID;               ///< PCI Device ID.
-    int pciDomainID;               ///< PCI Domain ID
-    int tccDriver;                 ///< 1:If device is Tesla device using TCC driver, else 0
-    int asyncEngineCount;          ///< Number of async engines
-    int unifiedAddressing;         ///< Does device and host share unified address space
-    int memoryClockRate;           ///< Max global memory clock frequency in khz.
-    int memoryBusWidth;            ///< Global memory bus width in bits.
-    int l2CacheSize;               ///< L2 cache size.
-    int persistingL2CacheMaxSize;  ///< Device's max L2 persisting lines in bytes
-    int maxThreadsPerMultiProcessor;    ///< Maximum resident threads per multi-processor.
-    int streamPrioritiesSupported;      ///< Device supports stream priority
-    int globalL1CacheSupported;         ///< Indicates globals are cached in L1
-    int localL1CacheSupported;          ///< Locals are cahced in L1
-    size_t sharedMemPerMultiprocessor;  ///< Amount of shared memory available per multiprocessor.
-    int regsPerMultiprocessor;          ///< registers available per multiprocessor
-    int managedMemory;         ///< Device supports allocating managed memory on this system
-    int isMultiGpuBoard;       ///< 1 if device is on a multi-GPU board, 0 if not.
-    int multiGpuBoardGroupID;  ///< Unique identifier for a group of devices on same multiboard GPU
-    int hostNativeAtomicSupported;         ///< Link between host and device supports native atomics
-    int singleToDoublePrecisionPerfRatio;  ///< Deprecated. CUDA only.
-    int pageableMemoryAccess;              ///< Device supports coherently accessing pageable memory
-                                           ///< without calling hipHostRegister on it
-    int concurrentManagedAccess;  ///< Device can coherently access managed memory concurrently with
-                                  ///< the CPU
-    int computePreemptionSupported;         ///< Is compute preemption supported on the device
-    int canUseHostPointerForRegisteredMem;  ///< Device can access host registered memory with same
-                                            ///< address as the host
-    int cooperativeLaunch;                  ///< HIP device supports cooperative launch
-    int cooperativeMultiDeviceLaunch;       ///< HIP device supports cooperative launch on multiple
-                                            ///< devices
-    size_t
-        sharedMemPerBlockOptin;  ///< Per device m ax shared mem per block usable by special opt in
-    int pageableMemoryAccessUsesHostPageTables;  ///< Device accesses pageable memory via the host's
-                                                 ///< page tables
-    int directManagedMemAccessFromHost;  ///< Host can directly access managed memory on the device
-                                         ///< without migration
-    int maxBlocksPerMultiProcessor;      ///< Max number of blocks on CU
-    int accessPolicyMaxWindowSize;       ///< Max value of access policy window
-    size_t reservedSharedMemPerBlock;    ///< Shared memory reserved by driver per block
-    int hostRegisterSupported;           ///< Device supports hipHostRegister
-    int sparseHipArraySupported;         ///< Indicates if device supports sparse hip arrays
-    int hostRegisterReadOnlySupported;   ///< Device supports using the hipHostRegisterReadOnly flag
-                                         ///< with hipHostRegistger
-    int timelineSemaphoreInteropSupported;  ///< Indicates external timeline semaphore support
-    int memoryPoolsSupported;  ///< Indicates if device supports hipMallocAsync and hipMemPool APIs
-    int gpuDirectRDMASupported;                    ///< Indicates device support of RDMA APIs
-    unsigned int gpuDirectRDMAFlushWritesOptions;  ///< Bitmask to be interpreted according to
-                                                   ///< hipFlushGPUDirectRDMAWritesOptions
-    int gpuDirectRDMAWritesOrdering;               ///< value of hipGPUDirectRDMAWritesOrdering
-    unsigned int
-        memoryPoolSupportedHandleTypes;  ///< Bitmask of handle types support with mempool based IPC
-    int deferredMappingHipArraySupported;  ///< Device supports deferred mapping HIP arrays and HIP
-                                           ///< mipmapped arrays
-    int ipcEventSupported;                 ///< Device supports IPC events
-    int clusterLaunch;                     ///< Device supports cluster launch
-    int unifiedFunctionPointers;           ///< Indicates device supports unified function pointers
-    int reserved[63];                      ///< CUDA Reserved.
+typedef struct hipDeviceProp_t
+{
+  char name[256];                                // Device name.
+  hipUUID uuid;                                  // UUID of a device
+  char luid[8];                                  // 8-byte unique identifier. Only valid on windows
+  unsigned int luidDeviceNodeMask;               // LUID node mask
+  size_t totalGlobalMem;                         // Size of global memory region (in bytes).
+  size_t sharedMemPerBlock;                      // Size of shared memory per block (in bytes).
+  int regsPerBlock;                              // Registers per block.
+  int warpSize;                                  // Warp size.
+  size_t memPitch;                               // Maximum pitch in bytes allowed by memory copies of pitched memory
+  int maxThreadsPerBlock;                        // Max work items per work group or workgroup max size.
+  int maxThreadsDim[3];                          // Max number of threads in each dimension (XYZ) of a block.
+  int maxGridSize[3];                            // Max grid dimensions (XYZ).
+  int clockRate;                                 // Max clock frequency of the multiProcessors in khz.
+  size_t totalConstMem;                          // Size of shared constant memory region on the device (in bytes).
+  int major;                                     // Major compute capability. On HCC, this is an approximation and features may
+                                                 // differ from CUDA CC.  See the arch feature flags for portable ways to query feature caps.
+  int minor;                                     // Minor compute capability. On HCC, this is an approximation and features may
+                                                 // differ from CUDA CC.  See the arch feature flags for portable ways to query feature caps.
+  size_t textureAlignment;                       // Alignment requirement for textures
+  size_t texturePitchAlignment;                  // Pitch alignment requirement for texture references bound to pitched memory
+  int deviceOverlap;                             // Deprecated. Use asyncEngineCount instead
+  int multiProcessorCount;                       // Number of multi-processors (compute units).
+  int kernelExecTimeoutEnabled;                  // Run time limit for kernels executed on the device
+  int integrated;                                // APU vs dGPU
+  int canMapHostMemory;                          // Check whether HIP can map host memory
+  int computeMode;                               // Compute mode.
+  int maxTexture1D;                              // Maximum number of elements in 1D images
+  int maxTexture1DMipmap;                        // Maximum 1D mipmap texture size
+  int maxTexture1DLinear;                        // Maximum size for 1D textures bound to linear memory
+  int maxTexture2D[2];                           // Maximum dimensions (width, height) of 2D images, in image elements
+  int maxTexture2DMipmap[2];                     // Maximum number of elements in 2D array mipmap of images
+  int maxTexture2DLinear[3];                     // Maximum 2D tex dimensions if tex are bound to pitched memory
+  int maxTexture2DGather[2];                     // Maximum 2D tex dimensions if gather has to be performed
+  int maxTexture3D[3];                           // Maximum dimensions (width, height, depth) of 3D images, in image elements
+  int maxTexture3DAlt[3];                        // Maximum alternate 3D texture dims
+  int maxTextureCubemap;                         // Maximum cubemap texture dims
+  int maxTexture1DLayered[2];                    // Maximum number of elements in 1D array images
+  int maxTexture2DLayered[3];                    // Maximum number of elements in 2D array images
+  int maxTextureCubemapLayered[2];               // Maximum cubemaps layered texture dims
+  int maxSurface1D;                              // Maximum 1D surface size
+  int maxSurface2D[2];                           // Maximum 2D surface size
+  int maxSurface3D[3];                           // Maximum 3D surface size
+  int maxSurface1DLayered[2];                    // Maximum 1D layered surface size
+  int maxSurface2DLayered[3];                    // Maximum 2D layered surface size
+  int maxSurfaceCubemap;                         // Maximum cubemap surface size
+  int maxSurfaceCubemapLayered[2];               // Maximum cubemap layered surface size
+  size_t surfaceAlignment;                       // Alignment requirement for surface
+  int concurrentKernels;                         // Device can possibly execute multiple kernels concurrently.
+  int ECCEnabled;                                // Device has ECC support enabled
+  int pciBusID;                                  // PCI Bus ID.
+  int pciDeviceID;                               // PCI Device ID.
+  int pciDomainID;                               // PCI Domain ID
+  int tccDriver;                                 // 1:If device is Tesla device using TCC driver, else 0
+  int asyncEngineCount;                          // Number of async engines
+  int unifiedAddressing;                         // Does device and host share unified address space
+  int memoryClockRate;                           // Max global memory clock frequency in khz.
+  int memoryBusWidth;                            // Global memory bus width in bits.
+  int l2CacheSize;                               // L2 cache size.
+  int persistingL2CacheMaxSize;                  // Device's max L2 persisting lines in bytes
+  int maxThreadsPerMultiProcessor;               // Maximum resident threads per multi-processor.
+  int streamPrioritiesSupported;                 // Device supports stream priority
+  int globalL1CacheSupported;                    // Indicates globals are cached in L1
+  int localL1CacheSupported;                     // Locals are cached in L1
+  size_t sharedMemPerMultiprocessor;             // Amount of shared memory available per multiprocessor.
+  int regsPerMultiprocessor;                     // registers available per multiprocessor
+  int managedMemory;                             // Device supports allocating managed memory on this system
+  int isMultiGpuBoard;                           // 1 if device is on a multi-GPU board, 0 if not.
+  int multiGpuBoardGroupID;                      // Unique identifier for a group of devices on same multiboard GPU
+  int hostNativeAtomicSupported;                 // Link between host and device supports native atomics
+  int singleToDoublePrecisionPerfRatio;          // Deprecated. CUDA only.
+  int pageableMemoryAccess;                      // Device supports coherently accessing pageable memory
+                                                 // without calling hipHostRegister on it
+  int concurrentManagedAccess;                   // Device can coherently access managed memory concurrently with the CPU
+  int computePreemptionSupported;                // Is compute preemption supported on the device
+  int canUseHostPointerForRegisteredMem;         // Device can access host registered memory with same address as the host
+  int cooperativeLaunch;                         // HIP device supports cooperative launch
+  int cooperativeMultiDeviceLaunch;              // HIP device supports cooperative launch on multiple devices
+  size_t sharedMemPerBlockOptin;                 // Per device max shared mem per block usable by special opt in
+  int pageableMemoryAccessUsesHostPageTables;    // Device accesses pageable memory via the host's page tables
+  int directManagedMemAccessFromHost;            // Host can directly access managed memory on the device without migration
+  int maxBlocksPerMultiProcessor;                // Max number of blocks on CU
+  int accessPolicyMaxWindowSize;                 // Max value of access policy window
+  size_t reservedSharedMemPerBlock;              // Shared memory reserved by driver per block
+  int hostRegisterSupported;                     // Device supports hipHostRegister
+  int sparseHipArraySupported;                   // Indicates if device supports sparse hip arrays
+  int hostRegisterReadOnlySupported;             // Device supports using the hipHostRegisterReadOnly flag with hipHostRegister
+  int timelineSemaphoreInteropSupported;         // Indicates external timeline semaphore support
+  int memoryPoolsSupported;                      // Indicates if device supports hipMallocAsync and hipMemPool APIs
+  int gpuDirectRDMASupported;                    // Indicates device support of RDMA APIs
+  unsigned int gpuDirectRDMAFlushWritesOptions;  // Bitmask to be interpreted according to hipFlushGPUDirectRDMAWritesOptions
+  int gpuDirectRDMAWritesOrdering;               // value of hipGPUDirectRDMAWritesOrdering
+  unsigned int memoryPoolSupportedHandleTypes;   // Bitmask of handle types support with mempool based IPC
+  int deferredMappingHipArraySupported;          // Device supports deferred mapping HIP arrays and HIP mipmapped arrays
+  int ipcEventSupported;                         // Device supports IPC events
+  int clusterLaunch;                             // Device supports cluster launch
+  int unifiedFunctionPointers;                   // Indicates device supports unified function pointers
+  int reserved[63];                              // CUDA Reserved.
 
-    int hipReserved[32];  ///< Reserved for adding new entries for HIP/CUDA.
+  int hipReserved[32];                           // Reserved for adding new entries for HIP/CUDA.
+
+  /* HIP Only struct members */
+  char gcnArchName[256];                         // AMD GCN Arch Name. HIP Only.
+  size_t maxSharedMemoryPerMultiProcessor;       // Maximum Shared Memory Per CU. HIP Only.
+  int clockInstructionRate;                      // Frequency in khz of the timer used by the device-side "clock*" instructions. New for HIP.
+  hipDeviceArch_t arch;                          // Architectural feature flags.  New for HIP.
+  unsigned int* hdpMemFlushCntl;                 // Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+  unsigned int* hdpRegFlushCntl;                 // Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+  int cooperativeMultiDeviceUnmatchedFunc;       // HIP device supports cooperative launch on multiple devices with unmatched functions
+  int cooperativeMultiDeviceUnmatchedGridDim;    // HIP device supports cooperative launch on multiple devices with unmatched grid dimensions
+  int cooperativeMultiDeviceUnmatchedBlockDim;   // HIP device supports cooperative launch on multiple devices with unmatched block dimensions
+  int cooperativeMultiDeviceUnmatchedSharedMem;  // HIP device supports cooperative launch on multiple devices with unmatched shared memories
+  int isLargeBar;                                // 1: if it is a large PCI bar device, else 0
+  int asicRevision;                              // Revision of the GPU in this device
 
-    /* HIP Only struct members */
-    char gcnArchName[256];                    ///< AMD GCN Arch Name. HIP Only.
-    size_t maxSharedMemoryPerMultiProcessor;  ///< Maximum Shared Memory Per CU. HIP Only.
-    int clockInstructionRate;  ///< Frequency in khz of the timer used by the device-side "clock*"
-                               ///< instructions.  New for HIP.
-    hipDeviceArch_t arch;      ///< Architectural feature flags.  New for HIP.
-    unsigned int* hdpMemFlushCntl;            ///< Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register
-    unsigned int* hdpRegFlushCntl;            ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register
-    int cooperativeMultiDeviceUnmatchedFunc;  ///< HIP device supports cooperative launch on
-                                              ///< multiple
-                                              /// devices with unmatched functions
-    int cooperativeMultiDeviceUnmatchedGridDim;    ///< HIP device supports cooperative launch on
-                                                   ///< multiple
-                                                   /// devices with unmatched grid dimensions
-    int cooperativeMultiDeviceUnmatchedBlockDim;   ///< HIP device supports cooperative launch on
-                                                   ///< multiple
-                                                   /// devices with unmatched block dimensions
-    int cooperativeMultiDeviceUnmatchedSharedMem;  ///< HIP device supports cooperative launch on
-                                                   ///< multiple
-                                                   /// devices with unmatched shared memories
-    int isLargeBar;                                ///< 1: if it is a large PCI bar device, else 0
-    int asicRevision;                              ///< Revision of the GPU in this device
 } hipDeviceProp_t;
 
 //Flags that can be used with hipStreamCreateWithFlags.
 /** Default stream creation flags. These are used with hipStreamCreate().*/
-#define hipStreamDefault  0x00
+#define hipStreamDefault 0x00
 
 /** Stream does not implicitly synchronize with null stream.*/
 #define hipStreamNonBlocking 0x01
@@ -511,46 +473,48 @@ typedef struct hipDeviceProp_t {
 
 /** Use a device-scope release when recording this event. This flag is useful to obtain more
  * precise timings of commands between events.  The flag is a no-op on CUDA platforms.*/
-#define hipEventReleaseToDevice  0x40000000
+#define hipEventReleaseToDevice 0x40000000
 
 /** Use a system-scope release when recording this event. This flag is useful to make
  * non-coherent host memory visible to the host. The flag is a no-op on CUDA platforms.*/
-#define hipEventReleaseToSystem  0x80000000
+#define hipEventReleaseToSystem 0x80000000
 
 /** Automatically select between Spin and Yield.*/
 #define hipDeviceScheduleAuto 0x0
 
 /** Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may
  * consume more power.*/
-#define hipDeviceScheduleSpin  0x1
+#define hipDeviceScheduleSpin 0x1
 
 /** Yield the CPU to the operating system when waiting. May increase latency, but lowers power
  * and is friendlier to other threads in the system.*/
-#define hipDeviceScheduleYield  0x2
+#define hipDeviceScheduleYield 0x2
 #define hipDeviceScheduleBlockingSync 0x4
 #define hipDeviceScheduleMask 0x7
 #define hipDeviceMapHost 0x8
 #define hipDeviceLmemResizeToMax 0x16
 
-typedef enum hipJitOption {
-    hipJitOptionMaxRegisters = 0,
-    hipJitOptionThreadsPerBlock,
-    hipJitOptionWallTime,
-    hipJitOptionInfoLogBuffer,
-    hipJitOptionInfoLogBufferSizeBytes,
-    hipJitOptionErrorLogBuffer,
-    hipJitOptionErrorLogBufferSizeBytes,
-    hipJitOptionOptimizationLevel,
-    hipJitOptionTargetFromContext,
-    hipJitOptionTarget,
-    hipJitOptionFallbackStrategy,
-    hipJitOptionGenerateDebugInfo,
-    hipJitOptionLogVerbose,
-    hipJitOptionGenerateLineInfo,
-    hipJitOptionCacheMode,
-    hipJitOptionSm3xOpt,
-    hipJitOptionFastCompile,
-    hipJitOptionNumOptions
+typedef enum hipJitOption
+{
+  hipJitOptionMaxRegisters = 0,
+  hipJitOptionThreadsPerBlock,
+  hipJitOptionWallTime,
+  hipJitOptionInfoLogBuffer,
+  hipJitOptionInfoLogBufferSizeBytes,
+  hipJitOptionErrorLogBuffer,
+  hipJitOptionErrorLogBufferSizeBytes,
+  hipJitOptionOptimizationLevel,
+  hipJitOptionTargetFromContext,
+  hipJitOptionTarget,
+  hipJitOptionFallbackStrategy,
+  hipJitOptionGenerateDebugInfo,
+  hipJitOptionLogVerbose,
+  hipJitOptionGenerateLineInfo,
+  hipJitOptionCacheMode,
+  hipJitOptionSm3xOpt,
+  hipJitOptionFastCompile,
+  hipJitOptionNumOptions
+
 } hipJitOption;
 
 // stop: hip_runtime_api.h
@@ -563,11 +527,17 @@ typedef enum hipJitOption {
 
 #define HIP_API_CALL HIPAPI
 
+// deprecated
 typedef hipError_t (HIP_API_CALL *HIP_HIPCTXCREATE)              (hipCtx_t *, unsigned int, hipDevice_t);
+// deprecated
 typedef hipError_t (HIP_API_CALL *HIP_HIPCTXDESTROY)             (hipCtx_t);
+// deprecated
 typedef hipError_t (HIP_API_CALL *HIP_HIPCTXPOPCURRENT)          (hipCtx_t *);
+// deprecated
 typedef hipError_t (HIP_API_CALL *HIP_HIPCTXPUSHCURRENT)         (hipCtx_t);
+// deprecated
 typedef hipError_t (HIP_API_CALL *HIP_HIPCTXSETCURRENT)          (hipCtx_t);
+// deprecated
 typedef hipError_t (HIP_API_CALL *HIP_HIPCTXSYNCHRONIZE)         (void);
 typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETATTRIBUTE)     (int *, hipDeviceAttribute_t, hipDevice_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETCOUNT)         (int *);
@@ -575,7 +545,8 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGET)              (hipDevice_t *,
 typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICEGETNAME)          (char *, int, hipDevice_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPDEVICETOTALMEM)         (size_t *, hipDevice_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPDRIVERGETVERSION)       (int *);
-typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATE)            (hipEvent_t *, unsigned int);
+typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATE)            (hipEvent_t *);
+typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTCREATEWITHFLAGS)   (hipEvent_t *, unsigned int);
 typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTDESTROY)           (hipEvent_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTELAPSEDTIME)       (float *, hipEvent_t, hipEvent_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPEVENTRECORD)            (hipEvent_t, hipStream_t);
@@ -588,6 +559,11 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPLAUNCHKERNEL)           (hipFunction_t,
 typedef hipError_t (HIP_API_CALL *HIP_HIPMEMALLOC)               (hipDeviceptr_t *, size_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPMEMFREE)                (hipDeviceptr_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPMEMGETINFO)             (size_t *, size_t *);
+typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTOD)             (hipDeviceptr_t, hipDeviceptr_t, size_t);
+typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTOH)             (void *, hipDeviceptr_t, size_t);
+typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYHTOD)             (hipDeviceptr_t, const void *, size_t);
+typedef hipError_t (HIP_API_CALL *HIP_HIPMEMSETD32)              (hipDeviceptr_t, unsigned int, size_t);
+typedef hipError_t (HIP_API_CALL *HIP_HIPMEMSETD8)               (hipDeviceptr_t, unsigned char, size_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTODASYNC)        (hipDeviceptr_t, hipDeviceptr_t, size_t, hipStream_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYDTOHASYNC)        (void *, hipDeviceptr_t, size_t, hipStream_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPMEMCPYHTODASYNC)        (hipDeviceptr_t, const void *, size_t, hipStream_t);
@@ -598,21 +574,30 @@ typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEGETGLOBAL)        (hipDeviceptr_t
 typedef hipError_t (HIP_API_CALL *HIP_HIPMODULELOADDATAEX)       (hipModule_t *, const void *, unsigned int, hipJitOption *, void **);
 typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEUNLOAD)           (hipModule_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPRUNTIMEGETVERSION)      (int *);
-typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE)           (hipStream_t *, unsigned int);
+typedef hipError_t (HIP_API_CALL *HIP_HIPSETDEVICE)              (hipDevice_t);
+typedef hipError_t (HIP_API_CALL *HIP_HIPSETDEVICEFLAGS)         (unsigned int);
+typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATE)           (hipStream_t *);
+typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMCREATEWITHFLAGS)  (hipStream_t *, unsigned int);
 typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMDESTROY)          (hipStream_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPSTREAMSYNCHRONIZE)      (hipStream_t);
 typedef hipError_t (HIP_API_CALL *HIP_HIPGETDEVICEPROPERTIES)    (hipDeviceProp_t *, hipDevice_t);
-typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR)    (int *, hipFunction_t, int, size_t);
+typedef hipError_t (HIP_API_CALL *HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR)  (int *, hipFunction_t, int, size_t);
 
 typedef struct hc_hip_lib
 {
   hc_dynlib_t lib;
 
+  // deprecated
   HIP_HIPCTXCREATE              hipCtxCreate;
+  // deprecated
   HIP_HIPCTXDESTROY             hipCtxDestroy;
+  // deprecated
   HIP_HIPCTXPOPCURRENT          hipCtxPopCurrent;
+  // deprecated
   HIP_HIPCTXPUSHCURRENT         hipCtxPushCurrent;
+  // deprecated
   HIP_HIPCTXSETCURRENT          hipCtxSetCurrent;
+  // deprecated
   HIP_HIPCTXSYNCHRONIZE         hipCtxSynchronize;
   HIP_HIPDEVICEGETATTRIBUTE     hipDeviceGetAttribute;
   HIP_HIPDEVICEGETCOUNT         hipDeviceGetCount;
@@ -621,6 +606,7 @@ typedef struct hc_hip_lib
   HIP_HIPDEVICETOTALMEM         hipDeviceTotalMem;
   HIP_HIPDRIVERGETVERSION       hipDriverGetVersion;
   HIP_HIPEVENTCREATE            hipEventCreate;
+  HIP_HIPEVENTCREATEWITHFLAGS   hipEventCreateWithFlags;
   HIP_HIPEVENTDESTROY           hipEventDestroy;
   HIP_HIPEVENTELAPSEDTIME       hipEventElapsedTime;
   HIP_HIPEVENTRECORD            hipEventRecord;
@@ -633,6 +619,11 @@ typedef struct hc_hip_lib
   HIP_HIPMEMALLOC               hipMemAlloc;
   HIP_HIPMEMFREE                hipMemFree;
   HIP_HIPMEMGETINFO             hipMemGetInfo;
+  HIP_HIPMEMCPYDTOD             hipMemcpyDtoD;
+  HIP_HIPMEMCPYDTOH             hipMemcpyDtoH;
+  HIP_HIPMEMCPYHTOD             hipMemcpyHtoD;
+  HIP_HIPMEMSETD32              hipMemsetD32;
+  HIP_HIPMEMSETD8               hipMemsetD8;
   HIP_HIPMEMCPYDTODASYNC        hipMemcpyDtoDAsync;
   HIP_HIPMEMCPYDTOHASYNC        hipMemcpyDtoHAsync;
   HIP_HIPMEMCPYHTODASYNC        hipMemcpyHtoDAsync;
@@ -643,57 +634,75 @@ typedef struct hc_hip_lib
   HIP_HIPMODULELOADDATAEX       hipModuleLoadDataEx;
   HIP_HIPMODULEUNLOAD           hipModuleUnload;
   HIP_HIPRUNTIMEGETVERSION      hipRuntimeGetVersion;
+  HIP_HIPSETDEVICE              hipSetDevice;
+  HIP_HIPSETDEVICEFLAGS         hipSetDeviceFlags;
   HIP_HIPSTREAMCREATE           hipStreamCreate;
+  HIP_HIPSTREAMCREATEWITHFLAGS  hipStreamCreateWithFlags;
   HIP_HIPSTREAMDESTROY          hipStreamDestroy;
   HIP_HIPSTREAMSYNCHRONIZE      hipStreamSynchronize;
   HIP_HIPGETDEVICEPROPERTIES    hipGetDeviceProperties;
-  HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR 	hipModuleOccupancyMaxActiveBlocksPerMultiprocessor;
+  HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR  hipModuleOccupancyMaxActiveBlocksPerMultiprocessor;
 
 } hc_hip_lib_t;
 
 typedef hc_hip_lib_t HIP_PTR;
 
-int  hip_init                  (void *hashcat_ctx);
-void hip_close                 (void *hashcat_ctx);
+int  hip_init                   (void *hashcat_ctx);
+void hip_close                  (void *hashcat_ctx);
 
-int hc_hipCtxCreate            (void *hashcat_ctx, hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
-int hc_hipCtxDestroy           (void *hashcat_ctx, hipCtx_t ctx);
-int hc_hipCtxPopCurrent        (void *hashcat_ctx, hipCtx_t *pctx);
-int hc_hipCtxPushCurrent       (void *hashcat_ctx, hipCtx_t ctx);
-int hc_hipCtxSetCurrent        (void *hashcat_ctx, hipCtx_t ctx);
-int hc_hipCtxSynchronize       (void *hashcat_ctx);
-int hc_hipDeviceGet            (void *hashcat_ctx, hipDevice_t *device, int ordinal);
-int hc_hipDeviceGetAttribute   (void *hashcat_ctx, int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
-int hc_hipDeviceGetCount       (void *hashcat_ctx, int *count);
-int hc_hipDeviceGetName        (void *hashcat_ctx, char *name, int len, hipDevice_t dev);
-int hc_hipDeviceTotalMem       (void *hashcat_ctx, size_t *bytes, hipDevice_t dev);
-int hc_hipDriverGetVersion     (void *hashcat_ctx, int *driverVersion);
-int hc_hipEventCreate          (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags);
-int hc_hipEventDestroy         (void *hashcat_ctx, hipEvent_t hEvent);
-int hc_hipEventElapsedTime     (void *hashcat_ctx, float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
-int hc_hipEventQuery           (void *hashcat_ctx, hipEvent_t hEvent);
-int hc_hipEventRecord          (void *hashcat_ctx, hipEvent_t hEvent, hipStream_t hStream);
-int hc_hipEventSynchronize     (void *hashcat_ctx, hipEvent_t hEvent);
-int hc_hipFuncGetAttribute     (void *hashcat_ctx, int *pi, hipFunction_attribute attrib, hipFunction_t hfunc);
-int hc_hipInit                 (void *hashcat_ctx, unsigned int Flags);
-int hc_hipLaunchKernel         (void *hashcat_ctx, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
-int hc_hipMemAlloc             (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize);
-int hc_hipMemFree              (void *hashcat_ctx, hipDeviceptr_t dptr);
-int hc_hipMemGetInfo           (void *hashcat_ctx, size_t *free, size_t *total);
-int hc_hipMemcpyDtoDAsync      (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
-int hc_hipMemcpyDtoHAsync      (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
-int hc_hipMemcpyHtoDAsync      (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
-int hc_hipMemsetD32Async       (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream);
-int hc_hipMemsetD8Async        (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream);
-int hc_hipModuleGetFunction    (void *hashcat_ctx, hipFunction_t *hfunc, hipModule_t hmod, const char *name);
-int hc_hipModuleGetGlobal      (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
-int hc_hipModuleLoadDataEx     (void *hashcat_ctx, hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
-int hc_hipModuleUnload         (void *hashcat_ctx, hipModule_t hmod);
-int hc_hipRuntimeGetVersion    (void *hashcat_ctx, int *runtimeVersion);
-int hc_hipStreamCreate         (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags);
-int hc_hipStreamDestroy        (void *hashcat_ctx, hipStream_t hStream);
-int hc_hipStreamSynchronize    (void *hashcat_ctx, hipStream_t hStream);
-int hc_hipGetDeviceProperties  (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev);
+  // deprecated
+int hc_hipCtxCreate             (void *hashcat_ctx, hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
+  // deprecated
+int hc_hipCtxDestroy            (void *hashcat_ctx, hipCtx_t ctx);
+  // deprecated
+int hc_hipCtxPopCurrent         (void *hashcat_ctx, hipCtx_t *pctx);
+  // deprecated
+int hc_hipCtxPushCurrent        (void *hashcat_ctx, hipCtx_t ctx);
+  // deprecated
+int hc_hipCtxSetCurrent         (void *hashcat_ctx, hipCtx_t ctx);
+  // deprecated
+int hc_hipCtxSynchronize        (void *hashcat_ctx);
+int hc_hipDeviceGet             (void *hashcat_ctx, hipDevice_t *device, int ordinal);
+int hc_hipDeviceGetAttribute    (void *hashcat_ctx, int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
+int hc_hipDeviceGetCount        (void *hashcat_ctx, int *count);
+int hc_hipDeviceGetName         (void *hashcat_ctx, char *name, int len, hipDevice_t dev);
+int hc_hipDeviceTotalMem        (void *hashcat_ctx, size_t *bytes, hipDevice_t dev);
+int hc_hipDriverGetVersion      (void *hashcat_ctx, int *driverVersion);
+int hc_hipEventCreate           (void *hashcat_ctx, hipEvent_t *phEvent);
+int hc_hipEventCreateWithFlags  (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags);
+int hc_hipEventDestroy          (void *hashcat_ctx, hipEvent_t hEvent);
+int hc_hipEventElapsedTime      (void *hashcat_ctx, float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
+int hc_hipEventQuery            (void *hashcat_ctx, hipEvent_t hEvent);
+int hc_hipEventRecord           (void *hashcat_ctx, hipEvent_t hEvent, hipStream_t hStream);
+int hc_hipEventSynchronize      (void *hashcat_ctx, hipEvent_t hEvent);
+int hc_hipFuncGetAttribute      (void *hashcat_ctx, int *pi, hipFunction_attribute attrib, hipFunction_t hfunc);
+int hc_hipInit                  (void *hashcat_ctx, unsigned int Flags);
+int hc_hipLaunchKernel          (void *hashcat_ctx, hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
+int hc_hipMemAlloc              (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t bytesize);
+int hc_hipMemFree               (void *hashcat_ctx, hipDeviceptr_t dptr);
+int hc_hipMemGetInfo            (void *hashcat_ctx, size_t *free, size_t *total);
+int hc_hipMemcpyDtoD            (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount);
+int hc_hipMemcpyDtoH            (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
+int hc_hipMemcpyHtoD            (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount);
+int hc_hipMemsetD32             (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N);
+int hc_hipMemsetD8              (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N);
+int hc_hipMemcpyDtoDAsync       (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
+int hc_hipMemcpyDtoHAsync       (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
+int hc_hipMemcpyHtoDAsync       (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
+int hc_hipMemsetD32Async        (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N, hipStream_t hStream);
+int hc_hipMemsetD8Async         (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N, hipStream_t hStream);
+int hc_hipModuleGetFunction     (void *hashcat_ctx, hipFunction_t *hfunc, hipModule_t hmod, const char *name);
+int hc_hipModuleGetGlobal       (void *hashcat_ctx, hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
+int hc_hipModuleLoadDataEx      (void *hashcat_ctx, hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
+int hc_hipModuleUnload          (void *hashcat_ctx, hipModule_t hmod);
+int hc_hipRuntimeGetVersion     (void *hashcat_ctx, int *runtimeVersion);
+int hc_hipSetDevice             (void *hashcat_ctx, hipDevice_t dev);
+int hc_hipSetDeviceFlags        (void *hashcat_ctx, unsigned int flags);
+int hc_hipStreamCreate          (void *hashcat_ctx, hipStream_t *phStream);
+int hc_hipStreamCreateWithFlags (void *hashcat_ctx, hipStream_t *phStream, unsigned int flags);
+int hc_hipStreamDestroy         (void *hashcat_ctx, hipStream_t hStream);
+int hc_hipStreamSynchronize     (void *hashcat_ctx, hipStream_t hStream);
+int hc_hipGetDeviceProperties   (void *hashcat_ctx, hipDeviceProp_t *prop, hipDevice_t dev);
 int hc_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor (void *hashcat_ctx, int *numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk);
 
 #endif // HC_EXT_HIP_H
diff --git a/include/ext_metal.h b/include/ext_metal.h
index a7a7d37a9..b51f09ff2 100644
--- a/include/ext_metal.h
+++ b/include/ext_metal.h
@@ -111,7 +111,7 @@ int  hc_mtlCreateLibraryWithFile    (void *hashcat_ctx, mtl_device_id metal_devi
 int  hc_mtlEncodeComputeCommand_pre (void *hashcat_ctx, mtl_pipeline metal_pipeline, mtl_command_queue metal_command_queue, mtl_command_buffer *metal_command_buffer, mtl_command_encoder *metal_command_encoder);
 int  hc_mtlSetCommandEncoderArg     (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, size_t off, size_t idx, mtl_mem buf, void *host_data, size_t host_data_size);
 
-int  hc_mtlEncodeComputeCommand     (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, size_t global_work_size, size_t local_work_size, double *ms);
+int  hc_mtlEncodeComputeCommand     (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms);
 
 #endif // __APPLE__
 
diff --git a/include/types.h b/include/types.h
index 22c57d85a..c72232906 100644
--- a/include/types.h
+++ b/include/types.h
@@ -115,6 +115,7 @@ typedef enum event_identifier
   EVENT_BRIDGES_SALT_POST         = 0x00000122,
   EVENT_BRIDGES_SALT_PRE          = 0x00000123,
   EVENT_CALCULATED_WORDS_BASE     = 0x00000020,
+  EVENT_CALCULATED_WORDS_CNT      = 0x00000021,
   EVENT_CRACKER_FINISHED          = 0x00000030,
   EVENT_CRACKER_HASH_CRACKED      = 0x00000031,
   EVENT_CRACKER_STARTING          = 0x00000032,
@@ -412,6 +413,9 @@ typedef enum opti_type
   OPTI_TYPE_REGISTER_LIMIT        = (1 << 20), // We'll limit the register count to 128
   OPTI_TYPE_SLOW_HASH_SIMD_INIT2  = (1 << 21),
   OPTI_TYPE_SLOW_HASH_SIMD_LOOP2  = (1 << 22),
+  OPTI_TYPE_SLOW_HASH_DIMY_INIT   = (1 << 23),
+  OPTI_TYPE_SLOW_HASH_DIMY_LOOP   = (1 << 24),
+  OPTI_TYPE_SLOW_HASH_DIMY_COMP   = (1 << 25),
 
 } opti_type_t;
 
@@ -476,14 +480,17 @@ typedef enum opts_type
   OPTS_TYPE_DYNAMIC_SHARED    = (1ULL << 53), // use dynamic shared memory (note: needs special kernel changes)
   OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 54), // some algos use JiT in combinations with a salt or create too much startup time
   OPTS_TYPE_MP_MULTI_DISABLE  = (1ULL << 55), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
-  OPTS_TYPE_NATIVE_THREADS    = (1ULL << 56), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
-  OPTS_TYPE_MAXIMUM_THREADS   = (1ULL << 57), // disable else branch in pre-compilation thread count optimization setting
-  OPTS_TYPE_POST_AMP_UTF16LE  = (1ULL << 58), // run the utf8 to utf16le conversion kernel after they have been processed from amplifiers
+  OPTS_TYPE_THREAD_MULTI_DISABLE              // do not multiply the kernel-power with the thread count per device for super slow algos
+                              = (1ULL << 56),
+  OPTS_TYPE_NATIVE_THREADS    = (1ULL << 57), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
+  OPTS_TYPE_MAXIMUM_THREADS   = (1ULL << 58), // disable else branch in pre-compilation thread count optimization setting
+  OPTS_TYPE_POST_AMP_UTF16LE  = (1ULL << 59), // run the utf8 to utf16le conversion kernel after they have been processed from amplifiers
   OPTS_TYPE_AUTODETECT_DISABLE
-                              = (1ULL << 59), // skip autodetect engine
-  OPTS_TYPE_STOCK_MODULE      = (1ULL << 60), // module included with hashcat default distribution
+                              = (1ULL << 60), // skip autodetect engine
+  OPTS_TYPE_STOCK_MODULE      = (1ULL << 61), // module included with hashcat default distribution
   OPTS_TYPE_MULTIHASH_DESPITE_ESALT
-                              = (1ULL << 61), // overrule multihash cracking check same salt but not same esalt
+                              = (1ULL << 62), // overrule multihash cracking check same salt but not same esalt
+  OPTS_TYPE_MAXIMUM_ACCEL     = (1ULL << 63)  // try to maximize kernel-accel during autotune
 
 } opts_type_t;
 
@@ -699,7 +706,7 @@ typedef enum user_options_defaults
   #else
   HWMON_TEMP_ABORT         = 90,
   #endif
-  HASH_INFO                = false,
+  HASH_INFO                = 0,
   HASH_MODE                = 0,
   HCCAPX_MESSAGE_PAIR      = 0,
   HEX_CHARSET              = false,
@@ -715,6 +722,7 @@ typedef enum user_options_defaults
   KERNEL_LOOPS             = 0,
   KERNEL_THREADS           = 0,
   KEYSPACE                 = false,
+  TOTAL_CANDIDATES         = false,
   LEFT                     = false,
   LIMIT                    = 0,
   LOGFILE                  = true,
@@ -822,7 +830,7 @@ typedef enum user_options_map
   IDX_DYNAMIC_X                 = 0xff55,
   IDX_ENCODING_FROM             = 0xff15,
   IDX_ENCODING_TO               = 0xff16,
-  IDX_HASH_INFO                 = 0xff17,
+  IDX_HASH_INFO                 = 'H', // 0xff17
   IDX_FORCE                     = 0xff18,
   IDX_HWMON_DISABLE             = 0xff19,
   IDX_HWMON_TEMP_ABORT          = 0xff1a,
@@ -898,6 +906,7 @@ typedef enum user_options_map
   IDX_STATUS_TIMER              = 0xff4c,
   IDX_STDOUT_FLAG               = 0xff4d,
   IDX_STDIN_TIMEOUT_ABORT       = 0xff4e,
+  IDX_TOTAL_CANDIDATES          = 0xff58,
   IDX_TRUECRYPT_KEYFILES        = 0xff4f,
   IDX_USERNAME                  = 0xff50,
   IDX_VERACRYPT_KEYFILES        = 0xff51,
@@ -1510,6 +1519,7 @@ typedef struct hc_device_param
   bool    has_lop3;
   bool    has_mov64;
   bool    has_prmt;
+  bool    has_shfw;
 
   double  spin_damp;
 
@@ -2424,13 +2434,13 @@ typedef struct user_options
   bool         deprecated_check;
   bool         dynamic_x;
   bool         hwmon;
-  bool         hash_info;
   bool         hex_charset;
   bool         hex_salt;
   bool         hex_wordlist;
   bool         increment;
   bool         keep_guessing;
   bool         keyspace;
+  bool         total_candidates;
   bool         left;
   bool         logfile;
   bool         loopback;
@@ -2517,6 +2527,7 @@ typedef struct user_options
   #endif
   u32          debug_mode;
   u32          hwmon_temp_abort;
+  u32          hash_info;
   int          hash_mode;
   u32          hccapx_message_pair;
   u32          hook_threads;
diff --git a/src/Makefile b/src/Makefile
index 2ed479330..2cfbf9406 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -358,7 +358,7 @@ LFLAGS_NATIVE           += -lpthread
 endif # NetBSD
 
 ifeq ($(UNAME),Darwin)
-export MACOSX_DEPLOYMENT_TARGET=10.15
+export MACOSX_DEPLOYMENT_TARGET=15.0
 CFLAGS_NATIVE           := $(CFLAGS)
 CFLAGS_NATIVE           += -DWITH_HWMON
 
diff --git a/src/autotune.c b/src/autotune.c
index df6d5529f..e8f704599 100644
--- a/src/autotune.c
+++ b/src/autotune.c
@@ -43,7 +43,8 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
   device_param->kernel_param.loop_cnt = kernel_loops; // not a bug, both need to be set
   device_param->kernel_param.il_cnt   = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
 
-  const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads;
+  const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_param->device_processors)
+                           * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads);
 
   u32 kernel_power_try = hardware_power * kernel_accel;
 
@@ -133,7 +134,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     device_param->kernel_accel   = kernel_accel_min;
     device_param->kernel_loops   = kernel_loops_min;
     device_param->kernel_threads = kernel_threads_min;
-    device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads_min;
+    device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_param->device_processors)
+                                 * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads_min);
     device_param->kernel_power   = device_param->hardware_power * kernel_accel_min;
   }
 
@@ -212,7 +214,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     // from here it's clear we are allowed to autotune
     // so let's init some fake words
 
-    const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads_max;
+    const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_param->device_processors)
+                                 * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads_max);
 
     u32 kernel_power_max = hardware_power_max * kernel_accel_max;
 
@@ -265,12 +268,12 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t)) == -1) return -1;
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t)) == -1) return -1;
           }
 
           #if defined (__APPLE__)
@@ -298,13 +301,13 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     {
       const u32 kernel_threads_sav = device_param->kernel_threads;
 
-      device_param->kernel_threads = device_param->kernel_wgs1;
+      device_param->kernel_threads = MIN (device_param->kernel_wgs1, kernel_threads_max);
 
       run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_max, false, 0, true);
 
       if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
       {
-        device_param->kernel_threads = device_param->kernel_wgs2p;
+        device_param->kernel_threads = MIN (device_param->kernel_wgs2p, kernel_threads_max);
 
         run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_max, false, 0, true);
       }
@@ -329,36 +332,88 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     }
 
     // v7 autotuner is a lot more straight forward
+    // we start with some purely theoretical values as a base, then move on to some measured tests
 
-    u32 kernel_loops_min_start = kernel_loops_min;
-
-    if (hashes && hashes->st_salts_buf)
+    if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
     {
-      u32 start = kernel_loops_max;
-
-      start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter));
-      start = MIN (start, smallest_repeat_double (hashes->st_salts_buf->salt_iter + 1));
-
-      if ((hashes->st_salts_buf->salt_iter     % 125) == 0) start = MIN (start, 125);
-      if ((hashes->st_salts_buf->salt_iter + 1 % 125) == 0) start = MIN (start, 125);
-
-      if ((start >= kernel_loops_min) && (start <= kernel_loops_max))
+      if (kernel_accel_min < kernel_accel_max)
       {
-        kernel_loops_min_start = start;
+        // let's also do some minimal accel, this is only to improve early measurements taken with try_run()
+
+        const u32 kernel_accel_start = previous_power_of_two (kernel_accel_max / 8);
+
+        if ((kernel_accel_start >= kernel_accel_min) && (kernel_accel_start <= kernel_accel_max))
+        {
+          kernel_accel = kernel_accel_start;
+        }
       }
     }
 
-    for (u32 kernel_loops_test = kernel_loops_min_start; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1)
+    if (kernel_threads_min < kernel_threads_max)
     {
-      double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_test, kernel_threads_min, 2);
+      // there could be a situation, like in 18600, where we have a thread_min which is not a multiple of
+      // kernel_preferred_wgs_multiple. As long as it's only a threads_min, but not a threads_max, we
+      // should stick to at least kernel_preferred_wgs_multiple
 
-      //printf ("loop %f %u %u %u\n", exec_msec, kernel_accel_min, kernel_loops_test, kernel_threads_min);
+      if (kernel_threads_min % device_param->kernel_preferred_wgs_multiple)
+      {
+        if ((device_param->kernel_preferred_wgs_multiple >= kernel_threads_min) && (device_param->kernel_preferred_wgs_multiple <= kernel_threads_max))
+        {
+          kernel_threads = device_param->kernel_preferred_wgs_multiple;
+        }
+      }
+    }
+
+    if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+    {
+      if (hashes && hashes->salts_buf)
+      {
+        u32 start = kernel_loops_max;
+
+        const u32 salt_iter = hashes->salts_buf->salt_iter; // we use the first salt as reference
+
+        if (salt_iter)
+        {
+          start = MIN (start, smallest_repeat_double (hashes->salts_buf->salt_iter));
+          start = MIN (start, smallest_repeat_double (hashes->salts_buf->salt_iter + 1));
+
+          if (((hashes->salts_buf->salt_iter + 0) % 125) == 0) start = MIN (start, 125);
+          if (((hashes->salts_buf->salt_iter + 1) % 125) == 0) start = MIN (start, 125);
+
+          if ((start >= kernel_loops_min) && (start <= kernel_loops_max))
+          {
+            kernel_loops = start;
+          }
+        }
+        else
+        {
+          // how can there be a slow hash with no iterations?
+        }
+      }
+    }
+    else
+    {
+      // let's also do some minimal loops, this is only to improve early measurements taken with try_run()
+
+      const u32 kernel_loops_start = previous_power_of_two (kernel_loops_max / 4);
+
+      if ((kernel_loops_start >= kernel_loops_min) && (kernel_loops_start <= kernel_loops_max))
+      {
+        kernel_loops = kernel_loops_start;
+      }
+    }
+
+    for (u32 kernel_loops_test = kernel_loops; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1)
+    {
+      double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops_test, kernel_threads, 2);
+
+      //printf ("loop %f %u %u %u\n", exec_msec, kernel_accel, kernel_loops_test, kernel_threads);
       if (exec_msec > target_msec) break;
 
       // we want a little room for threads to play with so not full target_msec
       // but of course only if we are going to make use of that :)
 
-      if ((kernel_accel_min < kernel_accel_max) || (kernel_threads_min < kernel_threads_max))
+      if ((kernel_accel < kernel_accel_max) || (kernel_threads < kernel_threads_max))
       {
         if (exec_msec > target_msec / 8) break;
 
@@ -372,21 +427,46 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       kernel_loops = kernel_loops_test;
     }
 
-    for (u32 kernel_threads_test = kernel_threads_min; kernel_threads_test <= kernel_threads_max; kernel_threads_test <<= 1)
-    {
-      double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops, kernel_threads_test, 2);
+    double exec_msec_init = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
 
-      //printf ("threads %f %u %u %u\n", exec_msec, kernel_accel_min, kernel_loops, kernel_threads_test);
+    float threads_eff_best = exec_msec_init / kernel_threads;
+    u32   threads_cnt_best = kernel_threads;
+
+    float threads_eff_prev = 0;
+    u32   threads_cnt_prev = 0;
+
+    for (u32 kernel_threads_test = kernel_threads; kernel_threads_test <= kernel_threads_max; kernel_threads_test = (kernel_threads_test < device_param->kernel_preferred_wgs_multiple) ? kernel_threads_test << 1 : kernel_threads_test + device_param->kernel_preferred_wgs_multiple)
+    {
+      double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads_test, 2);
+
+      //printf ("thread %f %u %u %u\n", exec_msec, kernel_accel, kernel_loops, kernel_threads_test);
       if (exec_msec > target_msec) break;
 
       if (kernel_threads >= 32)
       {
         // we want a little room for accel to play with so not full target_msec
 
-        if (exec_msec > target_msec / 8) break;
+        if (exec_msec > target_msec / 4) break;
       }
 
       kernel_threads = kernel_threads_test;
+
+      threads_eff_prev = exec_msec / kernel_threads_test;
+      threads_cnt_prev = kernel_threads_test;
+
+      //printf ("%f\n", threads_eff_prev);
+
+      if (threads_eff_prev < threads_eff_best)
+      {
+        threads_eff_best = threads_eff_prev;
+        threads_cnt_best = threads_cnt_prev;
+      }
+    }
+
+    // keep the largest accepted thread count, unless the best one measured is more than 6% more efficient (lower msec per thread)
+    if ((threads_eff_best * 1.06) < threads_eff_prev)
+    {
+      kernel_threads = threads_cnt_best;
     }
 
     #define STEPS_CNT 12
@@ -559,7 +639,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   device_param->kernel_loops   = kernel_loops;
   device_param->kernel_threads = kernel_threads;
 
-  const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads;
+  const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_param->device_processors)
+                           * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : device_param->kernel_threads);
 
   device_param->hardware_power = hardware_power;
 
@@ -598,7 +679,7 @@ HC_API_CALL void *thread_autotune (void *p)
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
   }
 
   // check for autotune failure
@@ -614,10 +695,6 @@ HC_API_CALL void *thread_autotune (void *p)
     if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL;
   }
 
-  if (device_param->is_hip == true)
-  {
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
-  }
-
   return NULL;
 }
+
diff --git a/src/backend.c b/src/backend.c
index 15e4badbb..e9e7a5572 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -402,7 +402,8 @@ static bool is_same_device_type (const hc_device_param_t *src, const hc_device_p
   }
 
   if (src->device_processors         != dst->device_processors)         return false;
-  if (src->device_maxclock_frequency != dst->device_maxclock_frequency) return false;
+  // clocks can be different, but clocks should have no impact on workload tuning
+  // if (src->device_maxclock_frequency != dst->device_maxclock_frequency) return false;
   if (src->device_maxworkgroup_size  != dst->device_maxworkgroup_size)  return false;
 
   // memory size can be different, depending on which gpu has a monitor connected
@@ -985,16 +986,16 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c
   {
     if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1;
 
-    if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &pw_idx, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1;
+    if (hc_cuMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1;
 
     if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
   }
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1;
 
-    if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &pw_idx, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1;
+    if (hc_hipMemcpyDtoH (hashcat_ctx, &pw_idx, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), sizeof (pw_idx_t)) == -1) return -1;
 
     if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
   }
@@ -1020,14 +1021,14 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c
   {
     if (device_param->is_cuda == true)
     {
-      if (hc_cuMemcpyDtoHAsync (hashcat_ctx, pw->i, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->cuda_stream) == -1) return -1;
+      if (hc_cuMemcpyDtoH (hashcat_ctx, pw->i, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1;
 
       if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
     }
 
     if (device_param->is_hip == true)
     {
-      if (hc_hipMemcpyDtoHAsync (hashcat_ctx, pw->i, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->hip_stream) == -1) return -1;
+      if (hc_hipMemcpyDtoH (hashcat_ctx, pw->i, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1;
 
       if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
     }
@@ -1058,11 +1059,6 @@ int gidd_to_pw_t (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, c
     if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return -1;
   }
 
-  if (device_param->is_hip == true)
-  {
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1;
-  }
-
   return 0;
 }
 
@@ -1072,7 +1068,7 @@ int copy_pws_idx (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u
   {
     if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1;
 
-    if (hc_cuMemcpyDtoHAsync (hashcat_ctx, dest, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t)), device_param->cuda_stream) == -1) return -1;
+    if (hc_cuMemcpyDtoH (hashcat_ctx, dest, device_param->cuda_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t))) == -1) return -1;
 
     if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
 
@@ -1081,13 +1077,11 @@ int copy_pws_idx (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1;
 
-    if (hc_hipMemcpyDtoHAsync (hashcat_ctx, dest, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t)), device_param->hip_stream) == -1) return -1;
+    if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_idx + (gidd * sizeof (pw_idx_t)), (cnt * sizeof (pw_idx_t))) == -1) return -1;
 
     if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
-
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1;
   }
 
   #if defined (__APPLE__)
@@ -1112,7 +1106,7 @@ int copy_pws_comp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
   {
     if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1) return -1;
 
-    if (hc_cuMemcpyDtoHAsync (hashcat_ctx, dest, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->cuda_stream) == -1) return -1;
+    if (hc_cuMemcpyDtoH (hashcat_ctx, dest, device_param->cuda_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1;
 
     if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
 
@@ -1121,13 +1115,11 @@ int copy_pws_comp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return -1;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return -1;
 
-    if (hc_hipMemcpyDtoHAsync (hashcat_ctx, dest, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32), device_param->hip_stream) == -1) return -1;
+    if (hc_hipMemcpyDtoH (hashcat_ctx, dest, device_param->hip_d_pws_comp_buf + (off * sizeof (u32)), cnt * sizeof (u32)) == -1) return -1;
 
     if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
-
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return -1;
   }
 
   #if defined (__APPLE__)
@@ -1199,12 +1191,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_tm_c, size_tm, device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_tm_c, size_tm) == -1) return -1;
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_tm_c, size_tm, device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_tm_c, size_tm) == -1) return -1;
           }
 
           #if defined (__APPLE__)
@@ -1309,12 +1301,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
     {
       if (device_param->is_cuda == true)
       {
-        if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, device_param->cuda_d_pws_amp_buf, pws_cnt * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+        if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_pws_buf, device_param->cuda_d_pws_amp_buf, pws_cnt * sizeof (pw_t)) == -1) return -1;
       }
 
       if (device_param->is_hip == true)
       {
-        if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, device_param->hip_d_pws_amp_buf, pws_cnt * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_pws_buf, device_param->hip_d_pws_amp_buf, pws_cnt * sizeof (pw_t)) == -1) return -1;
       }
 
       #if defined (__APPLE__)
@@ -1373,14 +1365,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
 
           if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
         }
 
         if (device_param->is_hip == true)
         {
-          if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
 
           if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
         }
@@ -1432,12 +1424,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
         }
 
         if (device_param->is_hip == true)
         {
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
         }
 
         #if defined (__APPLE__)
@@ -1534,14 +1526,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
           {
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
             }
@@ -1563,14 +1555,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
             }
@@ -1625,14 +1617,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
 
               if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, pws_cnt * hashconfig->hook_size) == -1) return -1;
 
               if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
             }
@@ -1684,12 +1676,12 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, pws_cnt * hashconfig->hook_size) == -1) return -1;
             }
 
             #if defined (__APPLE__)
@@ -1774,14 +1766,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
           {
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
             }
@@ -1803,14 +1795,14 @@ int choose_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, pws_cnt * hashconfig->tmp_size) == -1) return -1;
 
               if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
             }
@@ -2059,12 +2051,12 @@ int run_cuda_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t
   return 0;
 }
 
-int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u8 value, const u64 size)
+int run_cuda_kernel_memset (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u8 value, const u64 size)
 {
-  return hc_cuMemsetD8Async (hashcat_ctx, buf + offset, value, size, device_param->cuda_stream);
+  return hc_cuMemsetD8 (hashcat_ctx, buf + offset, value, size);
 }
 
-int run_cuda_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u32 value, const u64 size)
+int run_cuda_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, CUdeviceptr buf, const u64 offset, const u32 value, const u64 size)
 {
   /* check that the size is multiple of element size */
   if (size % 4 != 0)
@@ -2072,7 +2064,7 @@ int run_cuda_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *dev
     return CUDA_ERROR_INVALID_VALUE;
   }
 
-  return hc_cuMemsetD32Async (hashcat_ctx, buf + offset, value, size / 4, device_param->cuda_stream);
+  return hc_cuMemsetD32 (hashcat_ctx, buf + offset, value, size / 4);
 }
 
 int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, CUdeviceptr buf, const u64 size)
@@ -2096,7 +2088,7 @@ int run_cuda_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device
 
   if (num16m)
   {
-    if (hc_cuMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), bzeros, num16m, device_param->cuda_stream) == -1) return -1;
+    if (hc_cuMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), bzeros, num16m) == -1) return -1;
   }
 
   return 0;
@@ -2138,12 +2130,12 @@ int run_hip_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_t
   return 0;
 }
 
-int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u8  value, const u64 size)
+int run_hip_kernel_memset (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u8  value, const u64 size)
 {
-  return hc_hipMemsetD8Async (hashcat_ctx, buf + offset, value, size, device_param->hip_stream);
+  return hc_hipMemsetD8 (hashcat_ctx, buf + offset, value, size);
 }
 
-int run_hip_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u32 value, const u64 size)
+int run_hip_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 offset, const u32 value, const u64 size)
 {
   /* check that the size is multiple of element size */
   if (size % 4 != 0)
@@ -2151,7 +2143,7 @@ int run_hip_kernel_memset32 (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi
     return hipErrorInvalidValue;
   }
 
-  return hc_hipMemsetD32Async (hashcat_ctx, buf + offset, value, size / 4, device_param->hip_stream);
+  return hc_hipMemsetD32 (hashcat_ctx, buf + offset, value, size / 4);
 }
 
 int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, hipDeviceptr_t buf, const u64 size)
@@ -2175,7 +2167,7 @@ int run_hip_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_
 
   if (num16m)
   {
-    if (hc_hipMemcpyHtoDAsync (hashcat_ctx, buf + (num16d * 16), bzeros, num16m, device_param->hip_stream) == -1) return -1;
+    if (hc_hipMemcpyHtoD (hashcat_ctx, buf + (num16d * 16), bzeros, num16m) == -1) return -1;
   }
 
   return 0;
@@ -2205,7 +2197,7 @@ int run_metal_kernel_atinit (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi
 
   double ms = 0;
 
-  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1;
+  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
 
   return 0;
 }
@@ -2233,7 +2225,7 @@ int run_metal_kernel_utf8toutf16le (hashcat_ctx_t *hashcat_ctx, hc_device_param_
 
   double ms = 0;
 
-  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1;
+  if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
 
   return 0;
 }
@@ -2264,7 +2256,7 @@ int run_metal_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devic
 
     double ms = 0;
 
-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
   }
 
   if (num16m)
@@ -2466,7 +2458,7 @@ int run_opencl_kernel_bzero (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi
     }
     else
     {
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_FALSE, num16d * 16, num16m, bzeros, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, buf, CL_TRUE, num16d * 16, num16m, bzeros, 0, NULL, NULL) == -1) return -1;
     }
   }
 
@@ -2592,13 +2584,16 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       case KERN_RUN_AUX4:   cuda_function = device_param->cuda_function_aux4;   break;
     }
 
-    if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params, device_param->cuda_stream) == -1) return -1;
+    if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params) == -1) return -1;
 
     if (hc_cuFuncSetAttribute (hashcat_ctx, cuda_function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1;
 
     if (kernel_threads == 0) kernel_threads = 1;
 
-    num_elements = CEILDIV (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = CEILDIV (num_elements, kernel_threads);
+    }
 
     if (kern_run == KERN_RUN_1)
     {
@@ -2636,14 +2631,29 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       }
     }
 
+    u32 gridDimX = num_elements;
+    u32 gridDimY = 1;
+    u32 gridDimZ = 1;
+
+    u32 blockDimX = kernel_threads;
+    u32 blockDimY = 1;
+    u32 blockDimZ = 1;
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1))
+      blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy;
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2))
+      blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy;
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3))
+      blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy;
+
     if (is_autotune == true)
     {
-      if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1;
+      if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1;
     }
 
     if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event1, device_param->cuda_stream) == -1) return -1;
 
-    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1;
+    if (hc_cuLaunchKernel (hashcat_ctx, cuda_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->cuda_stream, device_param->kernel_params, NULL) == -1) return -1;
 
     if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event2, device_param->cuda_stream) == -1) return -1;
 
@@ -2693,13 +2703,16 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       case KERN_RUN_AUX4:   hip_function = device_param->hip_function_aux4;   break;
     }
 
-    if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params, device_param->hip_stream) == -1) return -1;
+    if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_kernel_param, &device_param->kernel_param, device_param->size_kernel_params) == -1) return -1;
 
     //if (hc_hipFuncSetAttribute (hashcat_ctx, hip_function, HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, dynamic_shared_mem) == -1) return -1;
 
     if (kernel_threads == 0) kernel_threads = 1;
 
-    num_elements = CEILDIV (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = CEILDIV (num_elements, kernel_threads);
+    }
 
     if (kern_run == KERN_RUN_1)
     {
@@ -2737,14 +2750,31 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       }
     }
 
+    u32 gridDimX = num_elements;
+    u32 gridDimY = 1;
+    u32 gridDimZ = 1;
+
+    u32 blockDimX = kernel_threads;
+    u32 blockDimY = 1;
+    u32 blockDimZ = 1;
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1))
+      blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy;
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2))
+      blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy;
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3))
+      blockDimY = hashcat_ctx->hashes->salts_buf->salt_dimy;
+
+    // debug aid (intentionally disabled) - dumps kern_run and the launch geometry: printf ("%d %d %d %d %d %d %d\n", kern_run, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ);
+
     if (is_autotune == true)
     {
-      if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1;
+      if (hc_hipLaunchKernel (hashcat_ctx, hip_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1;
     }
 
     if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event1, device_param->hip_stream) == -1) return -1;
 
-    if (hc_hipLaunchKernel (hashcat_ctx, hip_function, num_elements, 1, 1, kernel_threads, 1, 1, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1;
+    if (hc_hipLaunchKernel (hashcat_ctx, hip_function, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, dynamic_shared_mem, device_param->hip_stream, device_param->kernel_params, NULL) == -1) return -1;
 
     if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event2, device_param->hip_stream) == -1) return -1;
 
@@ -2821,7 +2851,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
 
     if (kernel_threads == 0) kernel_threads = 1;
 
-    num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    }
 
     if (kern_run == KERN_RUN_1)
     {
@@ -2859,16 +2892,43 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       }
     }
 
-    num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = round_up_multiple_32 (num_elements, kernel_threads);
+    }
+    else
+    {
+      num_elements = num_elements * kernel_threads;
+    }
 
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+    unsigned int work_dim = 1;
+
+    size_t global_work_size[3] = { num_elements,   1, 1 };
+    size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
 
     double ms = 0;
 
     if (is_autotune == true)
     {
-      hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms);
+      hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, work_dim, global_work_size, local_work_size, &ms);
 
       // hc_mtlEncodeComputeCommand_pre() must be called before every hc_mtlEncodeComputeCommand()
       if (hc_mtlEncodeComputeCommand_pre (hashcat_ctx, metal_pipeline, device_param->metal_command_queue, &metal_command_buffer, &metal_command_encoder) == -1) return -1;
@@ -2887,7 +2947,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       }
     }
 
-    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms);
+    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, work_dim, global_work_size, local_work_size, &ms);
 
     if (rc_cc != -1)
     {
@@ -2944,7 +3004,7 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       if (hc_clSetKernelArg (hashcat_ctx, opencl_kernel, i, sizeof (cl_mem), device_param->kernel_params[i]) == -1) return -1;
     }
 
-    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_kernel_param, CL_FALSE, 0, device_param->size_kernel_params, &device_param->kernel_param, 0, NULL, NULL) == -1) return -1;
+    if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_kernel_param, CL_TRUE, 0, device_param->size_kernel_params, &device_param->kernel_param, 0, NULL, NULL) == -1) return -1;
 
     /*
     for (u32 i = 24; i <= 34; i++)
@@ -2958,7 +3018,10 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
     }
     */
 
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+    }
 
     cl_event opencl_event;
 
@@ -2984,17 +3047,44 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con
       }
     }
 
-    num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+    if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+    {
+      num_elements = round_up_multiple_64 (num_elements, kernel_threads);
+    }
+    else
+    {
+      num_elements = num_elements * kernel_threads;
+    }
 
-    const size_t global_work_size[3] = { num_elements,   1, 1 };
-    const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+    size_t global_work_size[3] = { num_elements,   1, 1 };
+    size_t local_work_size[3]  = { kernel_threads, 1, 1 };
+
+    cl_uint work_dim = 1;
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_INIT) && (kern_run == KERN_RUN_1))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_LOOP) && (kern_run == KERN_RUN_2))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
+
+    if ((hashconfig->opti_type & OPTI_TYPE_SLOW_HASH_DIMY_COMP) && (kern_run == KERN_RUN_3))
+    {
+      global_work_size[1] = local_work_size[1] = hashcat_ctx->hashes->salts_buf->salt_dimy;
+      work_dim = 2;
+    }
 
     if (is_autotune == true)
     {
-      if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1;
+      if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1;
     }
 
-    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1;
+    if (hc_clEnqueueNDRangeKernel (hashcat_ctx, device_param->opencl_command_queue, opencl_kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &opencl_event) == -1) return -1;
 
     // spin damper section
 
@@ -3250,7 +3340,7 @@ int run_kernel_mp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
     double ms = 0;
 
-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
   }
   #endif // __APPLE__
 
@@ -3341,7 +3431,7 @@ int run_kernel_tm (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 
     double ms = 0;
 
-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
   }
   #endif // __APPLE__
 
@@ -3425,7 +3515,7 @@ int run_kernel_amp (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param,
 
     double ms = 0;
 
-    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms);
+    const int rc_cc = hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms);
 
     // release tmp_buf
 
@@ -3505,7 +3595,7 @@ int run_kernel_decompress (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device
 
     double ms = 0;
 
-    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, global_work_size[0], local_work_size[0], &ms) == -1) return -1;
+    if (hc_mtlEncodeComputeCommand (hashcat_ctx, metal_command_encoder, metal_command_buffer, 1, global_work_size, local_work_size, &ms) == -1) return -1;
   }
   #endif // __APPLE__
 
@@ -3551,7 +3641,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
   {
     if (device_param->is_cuda == true)
     {
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
       const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3559,13 +3649,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
       if (off)
       {
-        if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1;
+        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
       }
     }
 
     if (device_param->is_hip == true)
     {
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
       const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3573,7 +3663,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
       if (off)
       {
-        if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
       }
     }
 
@@ -3595,7 +3685,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
     if (device_param->is_opencl == true)
     {
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
 
       const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3603,7 +3693,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
       if (off)
       {
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
+        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
       }
     }
 
@@ -3615,7 +3705,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
     {
       if (device_param->is_cuda == true)
       {
-        if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1;
+        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
         const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3623,13 +3713,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
         if (off)
         {
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
         }
       }
 
       if (device_param->is_hip == true)
       {
-        if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
         const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3637,7 +3727,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
         if (off)
         {
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
         }
       }
 
@@ -3659,7 +3749,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
       if (device_param->is_opencl == true)
       {
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
 
         const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3667,7 +3757,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
         if (off)
         {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
         }
       }
 
@@ -3713,7 +3803,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
           const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3721,13 +3811,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
           if (off)
           {
-            if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
           }
         }
 
         if (device_param->is_hip == true)
         {
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
           const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3735,7 +3825,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
           if (off)
           {
-            if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
           }
         }
 
@@ -3757,7 +3847,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
         if (device_param->is_opencl == true)
         {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
 
           const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3765,7 +3855,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
           if (off)
           {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
+            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
           }
         }
 
@@ -3777,7 +3867,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
         {
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
             const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3785,13 +3875,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
             if (off)
             {
-              if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
             }
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
             const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3799,7 +3889,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
             if (off)
             {
-              if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
             }
           }
 
@@ -3821,7 +3911,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
           if (device_param->is_opencl == true)
           {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
 
             const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3829,7 +3919,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
             if (off)
             {
-              if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
+              if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
             }
           }
 
@@ -3839,7 +3929,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
         {
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
             const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3847,13 +3937,13 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
             if (off)
             {
-              if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
             }
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_idx, device_param->pws_idx, pws_cnt * sizeof (pw_idx_t)) == -1) return -1;
 
             const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3861,7 +3951,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
             if (off)
             {
-              if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32), device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_comp_buf, device_param->pws_comp, off * sizeof (u32)) == -1) return -1;
             }
           }
 
@@ -3883,7 +3973,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
           if (device_param->is_opencl == true)
           {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_FALSE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
+            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_idx, CL_TRUE, 0, pws_cnt * sizeof (pw_idx_t), device_param->pws_idx, 0, NULL, NULL) == -1) return -1;
 
             const pw_idx_t *pw_idx = device_param->pws_idx + pws_cnt;
 
@@ -3891,7 +3981,7 @@ int run_copy (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const
 
             if (off)
             {
-              if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_FALSE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
+              if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_comp_buf, CL_TRUE, 0, off * sizeof (u32), device_param->pws_comp, 0, NULL, NULL) == -1) return -1;
             }
           }
 
@@ -4125,12 +4215,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
         {
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t)) == -1) return -1;
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules + (innerloop_pos * sizeof (kernel_rule_t)), innerloop_left * sizeof (kernel_rule_t)) == -1) return -1;
           }
 
           #if defined (__APPLE__)
@@ -4257,12 +4347,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
               if (device_param->is_cuda == true)
               {
-                if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+                if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               if (device_param->is_hip == true)
               {
-                if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+                if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               #if defined (__APPLE__)
@@ -4274,7 +4364,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
               if (device_param->is_opencl == true)
               {
-                if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1;
+                if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1;
               }
             }
             else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
@@ -4287,12 +4377,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
               if (device_param->is_cuda == true)
               {
-                if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+                if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               if (device_param->is_hip == true)
               {
-                if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+                if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               #if defined (__APPLE__)
@@ -4317,12 +4407,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
               if (device_param->is_cuda == true)
               {
-                if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+                if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               if (device_param->is_hip == true)
               {
-                if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+                if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               #if defined (__APPLE__)
@@ -4450,12 +4540,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
               if (device_param->is_cuda == true)
               {
-                if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+                if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               if (device_param->is_hip == true)
               {
-                if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+                if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->combs_buf, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               #if defined (__APPLE__)
@@ -4467,7 +4557,7 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
               if (device_param->is_opencl == true)
               {
-                if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1;
+                if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, innerloop_left * sizeof (pw_t), device_param->combs_buf, 0, NULL, NULL) == -1) return -1;
               }
             }
             else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
@@ -4480,12 +4570,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
               if (device_param->is_cuda == true)
               {
-                if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+                if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_combs_c, device_param->cuda_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               if (device_param->is_hip == true)
               {
-                if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+                if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_combs_c, device_param->hip_d_combs, innerloop_left * sizeof (pw_t)) == -1) return -1;
               }
 
               #if defined (__APPLE__)
@@ -4512,12 +4602,12 @@ int run_cracker (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, co
 
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_bfs, innerloop_left * sizeof (bf_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_bfs_c, device_param->cuda_d_bfs, innerloop_left * sizeof (bf_t)) == -1) return -1;
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_bfs, innerloop_left * sizeof (bf_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_bfs_c, device_param->hip_d_bfs, innerloop_left * sizeof (bf_t)) == -1) return -1;
           }
 
           #if defined (__APPLE__)
@@ -4696,8 +4786,8 @@ int backend_ctx_init (hashcat_ctx_t *hashcat_ctx)
   backend_ctx->enabled = false;
 
   if (user_options->usage      > 0)    return 0;
+  if (user_options->hash_info  > 0)    return 0;
 
-  if (user_options->hash_info == true) return 0;
   if (user_options->keyspace  == true) return 0;
   if (user_options->left      == true) return 0;
   if (user_options->show      == true) return 0;
@@ -5418,37 +5508,18 @@ void backend_ctx_destroy (hashcat_ctx_t *hashcat_ctx)
   memset (backend_ctx, 0, sizeof (backend_ctx_t));
 }
 
-int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
+static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device)
 {
-  const bridge_ctx_t    *bridge_ctx    = hashcat_ctx->bridge_ctx;
-  const folder_config_t *folder_config = hashcat_ctx->folder_config;
-        backend_ctx_t   *backend_ctx   = hashcat_ctx->backend_ctx;
-        user_options_t  *user_options  = hashcat_ctx->user_options;
+  const bridge_ctx_t   *bridge_ctx    = hashcat_ctx->bridge_ctx;
+        backend_ctx_t  *backend_ctx   = hashcat_ctx->backend_ctx;
+        user_options_t *user_options  = hashcat_ctx->user_options;
 
-  if (backend_ctx->enabled == false) return 0;
-
-  hc_device_param_t *devices_param = backend_ctx->devices_param;
-
-  bool need_adl           = false;
-  bool need_nvml          = false;
-  bool need_nvapi         = false;
-  bool need_sysfs_amdgpu  = false;
-  bool need_sysfs_cpu     = false;
-  bool need_iokit         = false;
-
-  int bridge_link_device = 0; // this will only count active device
-
-  int backend_devices_idx = 0; // this will not only count active devices
+  hc_device_param_t    *devices_param = backend_ctx->devices_param;
 
   bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false;
 
   int virtmulti = (bridge_ctx->enabled == true) ? bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti;
 
-  int virthost = -1;
-  int virthost_finder = user_options->backend_devices_virthost;
-
-  // CUDA
-
   int cuda_devices_cnt    = 0;
   int cuda_devices_active = 0;
 
@@ -5463,15 +5534,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
     if (is_virtualized == true)
     {
-      if ((virthost == -1) && (virthost_finder <= cuda_devices_cnt))
+      if ((*virthost == -1) && (*virthost_finder <= cuda_devices_cnt))
       {
         cuda_devices_cnt = virtmulti;
 
-        virthost = virthost_finder - 1;
+        *virthost = *virthost_finder - 1;
       }
       else
       {
-        virthost_finder -= cuda_devices_cnt;
+        *virthost_finder -= cuda_devices_cnt;
 
         cuda_devices_cnt = 0;
       }
@@ -5481,17 +5552,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
     // device specific
 
-    for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++, backend_devices_idx++)
+    for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++, (*backend_devices_idx)++)
     {
-      const u32 device_id = backend_devices_idx;
+      const u32 device_id = *backend_devices_idx;
 
-      const u32 cuda_devices_idx_real = (is_virtualized == true) ? virthost : cuda_devices_idx;
+      const u32 cuda_devices_idx_real = (is_virtualized == true) ? *virthost : cuda_devices_idx;
 
-      hc_device_param_t *device_param = &devices_param[backend_devices_idx];
+      hc_device_param_t *device_param = &devices_param[*backend_devices_idx];
 
       device_param->device_id = device_id;
 
-      backend_ctx->backend_device_from_cuda[cuda_devices_idx] = backend_devices_idx;
+      backend_ctx->backend_device_from_cuda[cuda_devices_idx] = *backend_devices_idx;
 
       CUdevice cuda_device;
 
@@ -5791,10 +5862,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV))
       {
-        need_nvml = true;
+        backend_ctx->need_nvml = true;
 
         #if defined (_WIN) || defined (__CYGWIN__)
-        need_nvapi = true;
+        backend_ctx->need_nvapi = true;
         #endif
       }
 
@@ -5855,25 +5926,26 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       device_param->has_lop3  = (sm >= 50) ? true : false;
       device_param->has_mov64 = (sm >= 10) ? true : false;
       device_param->has_prmt  = (sm >= 20) ? true : false;
+      device_param->has_shfw  = (sm >= 70) ? true : false;
+
+      // one-time init cuda context
+
+      if (hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
 
       // device_available_mem
 
-      CUcontext cuda_context;
-
-      if (hc_cuCtxCreate (hashcat_ctx, &cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
-      if (hc_cuCtxPushCurrent (hashcat_ctx, cuda_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
       size_t free  = 0;
       size_t total = 0;
 
@@ -5886,14 +5958,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       device_param->device_available_mem = ((u64) free * (100 - user_options->backend_devices_keepfree)) / 100;
 
-      if (hc_cuCtxPopCurrent (hashcat_ctx, &cuda_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
-      if (hc_cuCtxDestroy (hashcat_ctx, cuda_context) == -1)
+      if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1)
       {
         device_param->skipped = true;
 
@@ -5906,7 +5971,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if (device_param->skipped == false)
       {
-        device_param->bridge_link_device = bridge_link_device++;
+        device_param->bridge_link_device = (*bridge_link_device)++;
 
         cuda_devices_active++;
       }
@@ -5915,8 +5980,22 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
   backend_ctx->cuda_devices_cnt     = cuda_devices_cnt;
   backend_ctx->cuda_devices_active  = cuda_devices_active;
+}
 
-  // HIP
+static void backend_ctx_devices_init_hip (hashcat_ctx_t *hashcat_ctx, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device)
+{
+  #if defined (__linux__)
+  const folder_config_t *folder_config = hashcat_ctx->folder_config;
+  #endif
+  const bridge_ctx_t    *bridge_ctx    = hashcat_ctx->bridge_ctx;
+        backend_ctx_t   *backend_ctx   = hashcat_ctx->backend_ctx;
+        user_options_t  *user_options  = hashcat_ctx->user_options;
+
+  hc_device_param_t     *devices_param = backend_ctx->devices_param;
+
+  bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false;
+
+  int virtmulti = (bridge_ctx->enabled == true) ? bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti;
 
   int hip_devices_cnt    = 0;
   int hip_devices_active = 0;
@@ -5932,15 +6011,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
     if (is_virtualized == true)
     {
-      if ((virthost == -1) && (virthost_finder <= hip_devices_cnt))
+      if ((*virthost == -1) && (*virthost_finder <= hip_devices_cnt))
       {
         hip_devices_cnt = virtmulti;
 
-        virthost = virthost_finder - 1;
+        *virthost = *virthost_finder - 1;
       }
       else
       {
-        virthost_finder -= hip_devices_cnt;
+        *virthost_finder -= hip_devices_cnt;
 
         hip_devices_cnt = 0;
       }
@@ -5950,17 +6029,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
     // device specific
 
-    for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++, backend_devices_idx++)
+    for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++, (*backend_devices_idx)++)
     {
-      const u32 device_id = backend_devices_idx;
+      const u32 device_id = *backend_devices_idx;
 
-      const u32 hip_devices_idx_real = (is_virtualized == true) ? virthost : hip_devices_idx;
+      const u32 hip_devices_idx_real = (is_virtualized == true) ? *virthost : hip_devices_idx;
 
-      hc_device_param_t *device_param = &devices_param[backend_devices_idx];
+      hc_device_param_t *device_param = &devices_param[*backend_devices_idx];
 
       device_param->device_id = device_id;
 
-      backend_ctx->backend_device_from_hip[hip_devices_idx] = backend_devices_idx;
+      backend_ctx->backend_device_from_hip[hip_devices_idx] = *backend_devices_idx;
 
       hipDevice_t hip_device;
 
@@ -6027,6 +6106,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       device_param->device_processors = device_processors;
 
+      if ((device_param->device_processors == 1) && (device_param->device_host_unified_memory == 1))
+      {
+        // APUs return some weird numbers. These values seem more appropriate (from rocminfo)
+        //Compute Unit:            2
+        //SIMDs per CU:            2
+        //Wavefront Size:          32(0x20)
+        //Max Waves Per CU:        32(0x20)
+
+        device_param->device_processors = 2 * 32;
+      }
+
       // device_global_mem, device_maxmem_alloc, device_available_mem
 
       size_t bytes = 0;
@@ -6275,10 +6365,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD_USE_HIP) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP))
       {
-         need_adl = true;
+         backend_ctx->need_adl = true;
 
          #if defined (__linux__)
-         need_sysfs_amdgpu = true;
+         backend_ctx->need_sysfs_amdgpu = true;
          #endif
       }
 
@@ -6332,25 +6422,26 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       device_param->has_lop3  = false;
       device_param->has_mov64 = false;
       device_param->has_prmt  = false;
+      device_param->has_shfw  = true; // always reports false : prop.arch.hasFunnelShift;
+
+      // one-time init hip context
+
+      if (hc_hipSetDeviceFlags (hashcat_ctx, hipDeviceScheduleBlockingSync) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
 
       // device_available_mem
 
-      hipCtx_t hip_context;
-
-      if (hc_hipCtxCreate (hashcat_ctx, &hip_context, hipDeviceScheduleBlockingSync, device_param->hip_device) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
-      if (hc_hipCtxPushCurrent (hashcat_ctx, hip_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
       size_t free  = 0;
       size_t total = 0;
 
@@ -6363,20 +6454,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       device_param->device_available_mem = ((u64) free * (100 - user_options->backend_devices_keepfree)) / 100;
 
-      if (hc_hipCtxPopCurrent (hashcat_ctx, &hip_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
-      if (hc_hipCtxDestroy (hashcat_ctx, hip_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
       #if defined (__linux__)
       if (strchr (folder_config->cpath_real, ' ') != NULL)
       {
@@ -6404,7 +6481,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if (device_param->skipped == false)
       {
-        device_param->bridge_link_device = bridge_link_device++;
+        device_param->bridge_link_device = (*bridge_link_device)++;
 
         hip_devices_active++;
       }
@@ -6413,13 +6490,25 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
   backend_ctx->hip_devices_cnt     = hip_devices_cnt;
   backend_ctx->hip_devices_active  = hip_devices_active;
+}
 
-  // Metal
+static void backend_ctx_devices_init_metal (hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED int *virthost, MAYBE_UNUSED int *virthost_finder, MAYBE_UNUSED int *backend_devices_idx, MAYBE_UNUSED int *bridge_link_device)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
 
   int metal_devices_cnt    = 0;
   int metal_devices_active = 0;
 
   #if defined (__APPLE__)
+  const bridge_ctx_t    *bridge_ctx    = hashcat_ctx->bridge_ctx;
+        user_options_t  *user_options  = hashcat_ctx->user_options;
+
+  hc_device_param_t     *devices_param = backend_ctx->devices_param;
+
+  bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false;
+
+  int virtmulti = (bridge_ctx->enabled == true) ? bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti;
+
   if (backend_ctx->mtl)
   {
     // device count
@@ -6431,15 +6520,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
     if (is_virtualized == true)
     {
-      if ((virthost == -1) && (virthost_finder <= metal_devices_cnt))
+      if ((*virthost == -1) && (*virthost_finder <= metal_devices_cnt))
       {
         metal_devices_cnt = virtmulti;
 
-        virthost = virthost_finder - 1;
+        *virthost = *virthost_finder - 1;
       }
       else
       {
-        virthost_finder -= metal_devices_cnt;
+        *virthost_finder -= metal_devices_cnt;
 
         metal_devices_cnt = 0;
       }
@@ -6449,17 +6538,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
     // device specific
 
-    for (int metal_devices_idx = 0; metal_devices_idx < metal_devices_cnt; metal_devices_idx++, backend_devices_idx++)
+    for (int metal_devices_idx = 0; metal_devices_idx < metal_devices_cnt; metal_devices_idx++, (*backend_devices_idx)++)
     {
-      const u32 device_id = backend_devices_idx;
+      const u32 device_id = *backend_devices_idx;
 
-      const u32 metal_devices_idx_real = (is_virtualized == true) ? virthost : metal_devices_idx;
+      const u32 metal_devices_idx_real = (is_virtualized == true) ? *virthost : metal_devices_idx;
 
-      hc_device_param_t *device_param = &devices_param[backend_devices_idx];
+      hc_device_param_t *device_param = &devices_param[*backend_devices_idx];
 
       device_param->device_id = device_id;
 
-      backend_ctx->backend_device_from_metal[metal_devices_idx] = backend_devices_idx;
+      backend_ctx->backend_device_from_metal[metal_devices_idx] = *backend_devices_idx;
 
       mtl_device_id metal_device = NULL;
 
@@ -6789,7 +6878,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_APPLE))
       {
-        need_iokit = true;
+        backend_ctx->need_iokit = true;
       }
 
       // CPU burning loop damper
@@ -6820,6 +6909,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       device_param->has_lop3  = false;
       device_param->has_mov64 = false;
       device_param->has_prmt  = false;
+      device_param->has_shfw  = false;
 
       // check if we need skip device
 
@@ -6831,7 +6921,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if (device_param->skipped == false)
       {
-        device_param->bridge_link_device = bridge_link_device++;
+        device_param->bridge_link_device = (*bridge_link_device)++;
 
         metal_devices_active++;
       }
@@ -6841,8 +6931,20 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
   backend_ctx->metal_devices_cnt     = metal_devices_cnt;
   backend_ctx->metal_devices_active  = metal_devices_active;
+}
 
-  // OCL
+static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, int *virthost, int *virthost_finder, int *backend_devices_idx, int *bridge_link_device)
+{
+  const folder_config_t *folder_config = hashcat_ctx->folder_config;
+  const bridge_ctx_t    *bridge_ctx    = hashcat_ctx->bridge_ctx;
+        backend_ctx_t   *backend_ctx   = hashcat_ctx->backend_ctx;
+        user_options_t  *user_options  = hashcat_ctx->user_options;
+
+  hc_device_param_t     *devices_param = backend_ctx->devices_param;
+
+  bool is_virtualized = ((user_options->backend_devices_virtmulti > 1) || (bridge_ctx->enabled == true)) ? true : false;
+
+  int virtmulti = (bridge_ctx->enabled == true) ? bridge_ctx->get_unit_count (bridge_ctx->platform_context) : (int) user_options->backend_devices_virtmulti;
 
   int opencl_devices_cnt    = 0;
   int opencl_devices_active = 0;
@@ -6868,15 +6970,15 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       if (is_virtualized == true)
       {
-        if ((virthost == -1) && (virthost_finder <= (int) opencl_platform_devices_cnt))
+        if ((*virthost == -1) && (*virthost_finder <= (int) opencl_platform_devices_cnt))
         {
           opencl_platform_devices_cnt = virtmulti;
 
-          virthost = virthost_finder - 1;
+          *virthost = *virthost_finder - 1;
         }
         else
         {
-          virthost_finder -= (int) opencl_platform_devices_cnt;
+          *virthost_finder -= (int) opencl_platform_devices_cnt;
 
           opencl_platform_devices_cnt = 0;
         }
@@ -6884,21 +6986,21 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         opencl_platforms_devices_cnt[opencl_platforms_idx] = opencl_platform_devices_cnt;
       }
 
-      for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++, backend_devices_idx++, opencl_devices_cnt++)
+      for (u32 opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++, (*backend_devices_idx)++, opencl_devices_cnt++)
       {
-        const u32 device_id = backend_devices_idx;
+        const u32 device_id = *backend_devices_idx;
 
         hc_device_param_t *device_param = &devices_param[device_id];
 
         device_param->device_id = device_id;
 
-        backend_ctx->backend_device_from_opencl[opencl_devices_cnt] = backend_devices_idx;
+        backend_ctx->backend_device_from_opencl[opencl_devices_cnt] = *backend_devices_idx;
 
-        backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx] = backend_devices_idx;
+        backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx] = *backend_devices_idx;
 
         device_param->opencl_platform_vendor_id = opencl_platform_vendor_id;
 
-        device_param->opencl_device = opencl_platform_devices[(is_virtualized == true) ? virthost : (int) opencl_platform_devices_idx];
+        device_param->opencl_device = opencl_platform_devices[(is_virtualized == true) ? *virthost : (int) opencl_platform_devices_idx];
 
         //device_param->opencl_platform = opencl_platform;
 
@@ -7149,6 +7251,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
         device_param->opencl_device_c_version = opencl_device_c_version;
 
+        // device_host_unified_memory
+
+        cl_bool device_host_unified_memory = false;
+
+        if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof (device_host_unified_memory), &device_host_unified_memory, NULL) == -1)
+        {
+          device_param->skipped = true;
+
+          continue;
+        }
+
+        device_param->device_host_unified_memory = (device_host_unified_memory == CL_TRUE) ? 1 : 0;
+
         // max_compute_units
 
         cl_uint device_processors = 0;
@@ -7162,6 +7277,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
         device_param->device_processors = device_processors;
 
+        if ((device_param->device_processors == 1) && (device_param->device_host_unified_memory == 1))
+        {
+          // APUs return some weird numbers. These values seem more appropriate (from rocminfo)
+          //Compute Unit:            2
+          //SIMDs per CU:            2
+          //Wavefront Size:          32(0x20)
+          //Max Waves Per CU:        32(0x20)
+
+          device_param->device_processors = 2 * 32;
+        }
+
         #if defined (__APPLE__)
         if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
         {
@@ -7189,19 +7315,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         }
         #endif // __APPLE__
 
-        // device_host_unified_memory
-
-        cl_bool device_host_unified_memory = false;
-
-        if (hc_clGetDeviceInfo (hashcat_ctx, device_param->opencl_device, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof (device_host_unified_memory), &device_host_unified_memory, NULL) == -1)
-        {
-          device_param->skipped = true;
-
-          continue;
-        }
-
-        device_param->device_host_unified_memory = (device_host_unified_memory == CL_TRUE) ? 1 : 0;
-
         // device_global_mem
 
         cl_ulong device_global_mem = 0;
@@ -7688,12 +7801,12 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
           #if defined (__APPLE__)
           if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE)
           {
-            need_iokit = true;
+            backend_ctx->need_iokit = true;
           }
           #endif
 
           #if defined (__linux__)
-          need_sysfs_cpu = true;
+          backend_ctx->need_sysfs_cpu = true;
           #endif
         }
 
@@ -7701,19 +7814,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         {
           if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
           {
-            need_adl = true;
+            backend_ctx->need_adl = true;
 
             #if defined (__linux__)
-            need_sysfs_amdgpu = true;
+            backend_ctx->need_sysfs_amdgpu = true;
             #endif
           }
 
           if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV))
           {
-            need_nvml = true;
+            backend_ctx->need_nvml = true;
 
             #if defined (_WIN) || defined (__CYGWIN__)
-            need_nvapi = true;
+            backend_ctx->need_nvapi = true;
             #endif
           }
 
@@ -7722,7 +7835,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
           {
             if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE)
             {
-              need_iokit = true;
+              backend_ctx->need_iokit = true;
             }
           }
           #endif
@@ -7968,6 +8081,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
           device_param->has_lop3  = (sm >= 50) ? true : false;
           device_param->has_mov64 = (sm >= 10) ? true : false;
           device_param->has_prmt  = (sm >= 20) ? true : false;
+          device_param->has_shfw  = (sm >= 70) ? true : false;
         }
 
         // common driver check
@@ -8231,7 +8345,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
            * activate device
            */
 
-          device_param->bridge_link_device = bridge_link_device++;
+          device_param->bridge_link_device = (*bridge_link_device)++;
 
           opencl_devices_active++;
         }
@@ -8241,11 +8355,51 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
   backend_ctx->opencl_devices_cnt     = opencl_devices_cnt;
   backend_ctx->opencl_devices_active  = opencl_devices_active;
+}
+
+int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+
+  if (backend_ctx->enabled == false) return 0;
+
+  user_options_t    *user_options  = hashcat_ctx->user_options;
+  hc_device_param_t *devices_param = backend_ctx->devices_param;
+
+  backend_ctx->need_adl           = false;
+  backend_ctx->need_nvml          = false;
+  backend_ctx->need_nvapi         = false;
+  backend_ctx->need_sysfs_amdgpu  = false;
+  backend_ctx->need_sysfs_cpu     = false;
+  backend_ctx->need_iokit         = false;
+
+  int bridge_link_device = 0; // this counts only active devices
+
+  int backend_devices_idx = 0; // this counts all devices, not only active ones
+
+  int virthost = -1;
+  int virthost_finder = user_options->backend_devices_virthost;
+
+  // CUDA
+
+  backend_ctx_devices_init_cuda (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device);
+
+  // HIP
+
+  backend_ctx_devices_init_hip (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device);
+
+  // Metal
+
+  backend_ctx_devices_init_metal (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device);
+
+  // OCL
+
+  backend_ctx_devices_init_opencl (hashcat_ctx, &virthost, &virthost_finder, &backend_devices_idx, &bridge_link_device);
 
   // all devices combined go into backend_* variables
 
-  backend_ctx->backend_devices_cnt    = cuda_devices_cnt    + hip_devices_cnt    + metal_devices_cnt    + opencl_devices_cnt;
-  backend_ctx->backend_devices_active = cuda_devices_active + hip_devices_active + metal_devices_active + opencl_devices_active;
+  backend_ctx->backend_devices_cnt    = backend_ctx->cuda_devices_cnt    + backend_ctx->hip_devices_cnt    + backend_ctx->metal_devices_cnt    + backend_ctx->opencl_devices_cnt;
+  backend_ctx->backend_devices_active = backend_ctx->cuda_devices_active + backend_ctx->hip_devices_active + backend_ctx->metal_devices_active + backend_ctx->opencl_devices_active;
 
   #if defined (__APPLE__)
   // disable Metal devices if at least one OpenCL device is enabled
@@ -8363,6 +8517,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       device_param->has_lop3  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");                                                          \
       device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");  \
       device_param->has_prmt  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                                             \
+      device_param->has_shfw  = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"shf.l.wrap.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                                       \
 
     if (backend_devices_idx > 0)
     {
@@ -8378,6 +8533,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         device_param->has_lop3  = device_param_prev->has_lop3;
         device_param->has_mov64 = device_param_prev->has_mov64;
         device_param->has_prmt  = device_param_prev->has_prmt;
+        device_param->has_shfw  = device_param_prev->has_shfw;
       }
       else
       {
@@ -8437,13 +8593,9 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         if (device_param->skipped == true) continue;
       }
 
-      /**
-       * create command-queue
-       */
+      // one-time init metal command-queue
 
-      mtl_command_queue command_queue;
-
-      if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &command_queue) == -1)
+      if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &device_param->metal_command_queue) == -1)
       {
         device_param->skipped = true;
 
@@ -8492,11 +8644,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
           u8 tmp_host[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
 
-          if (hc_mtlMemcpyHtoD (hashcat_ctx, command_queue, tmp_device[c], 0, tmp_host, sizeof (tmp_host)) == -1) break;
-          if (hc_mtlMemcpyDtoH (hashcat_ctx, command_queue, tmp_host, tmp_device[c], 0, sizeof (tmp_host)) == -1) break;
+          if (hc_mtlMemcpyHtoD (hashcat_ctx, device_param->metal_command_queue, tmp_device[c], 0, tmp_host, sizeof (tmp_host)) == -1) break;
+          if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, tmp_host, tmp_device[c], 0, sizeof (tmp_host)) == -1) break;
 
-          if (hc_mtlMemcpyHtoD (hashcat_ctx, command_queue, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), tmp_host, sizeof (tmp_host)) == -1) break;
-          if (hc_mtlMemcpyDtoH (hashcat_ctx, command_queue, tmp_host, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host)) == -1) break;
+          if (hc_mtlMemcpyHtoD (hashcat_ctx, device_param->metal_command_queue, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), tmp_host, sizeof (tmp_host)) == -1) break;
+          if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, tmp_host, tmp_device[c], MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host)) == -1) break;
         }
 
         device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
@@ -8521,8 +8673,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         hcfree (tmp_device);
       }
 
-      hc_mtlReleaseCommandQueue (hashcat_ctx, command_queue);
-
       if (device_param->device_host_unified_memory == 1)
       {
         // so, we actually have only half the memory because we need the same buffers on host side
@@ -8548,11 +8698,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         if (device_param->skipped == true) continue;
       }
 
-      /**
-       * create context for each device
-       */
-
-      cl_context context;
+      // one-time init opencl context
 
       /*
       cl_context_properties properties[3];
@@ -8561,10 +8707,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       properties[1] = (cl_context_properties) device_param->opencl_platform;
       properties[2] = 0;
 
-      CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &context);
+      CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context);
       */
 
-      if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &context) == -1)
+      if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1)
       {
         device_param->skipped = true;
 
@@ -8574,13 +8720,9 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
         continue;
       }
 
-      /**
-       * create command-queue
-       */
+      // one-time init opencl command-queue
 
-      cl_command_queue command_queue;
-
-      if (hc_clCreateCommandQueue (hashcat_ctx, context, device_param->opencl_device, 0, &command_queue) == -1)
+      if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1)
       {
         device_param->skipped = true;
 
@@ -8595,17 +8737,17 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
       if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
       {
         #define RUN_INSTRUCTION_CHECKS() \
-          device_param->has_vadd     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32     %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
-          device_param->has_vaddc    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32    %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vadd_co  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32  %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
-          device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vsub     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32     %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
-          device_param->has_vsubb    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32    %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vsub_co  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32  %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
-          device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
-          device_param->has_vadd3    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32    %0,   0, 0, 0;\"      : \"=v\"(r1)); }"); \
-          device_param->has_vbfe     = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32     %0,   0, 0, 0;\"      : \"=v\"(r1)); }"); \
-          device_param->has_vperm    = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32    %0,   0, 0, 0;\"      : \"=v\"(r1)); }"); \
+          device_param->has_vadd     = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_U32     %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
+          device_param->has_vaddc    = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_U32    %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vadd_co  = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD_CO_U32  %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
+          device_param->has_vaddc_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADDC_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vsub     = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_U32     %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
+          device_param->has_vsubb    = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_U32    %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vsub_co  = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUB_CO_U32  %0, vcc, 0, 0;\"      : \"=v\"(r1)); }"); \
+          device_param->has_vsubb_co = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_SUBB_CO_U32 %0, vcc, 0, 0, vcc;\" : \"=v\"(r1)); }"); \
+          device_param->has_vadd3    = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_ADD3_U32    %0,   0, 0, 0;\"      : \"=v\"(r1)); }"); \
+          device_param->has_vbfe     = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_BFE_U32     %0,   0, 0, 0;\"      : \"=v\"(r1)); }"); \
+          device_param->has_vperm    = opencl_test_instruction (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, "__kernel void test () { uint r1; __asm__ __volatile__ (\"V_PERM_B32    %0,   0, 0, 0;\"      : \"=v\"(r1)); }"); \
 
         if (backend_devices_idx > 0)
         {
@@ -8652,6 +8794,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
           device_param->has_lop3  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");                                    \
           device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }"); \
           device_param->has_prmt  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"prmt.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                       \
+          device_param->has_shfw  = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"shf.l.wrap.b32 %0, 0, 0, 0;\" : \"=r\"(r)); }");                                 \
 
         if (backend_devices_idx > 0)
         {
@@ -8667,6 +8810,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
             device_param->has_lop3  = device_param_prev->has_lop3;
             device_param->has_mov64 = device_param_prev->has_mov64;
             device_param->has_prmt  = device_param_prev->has_prmt;
+            device_param->has_shfw  = device_param_prev->has_shfw;
           }
           else
           {
@@ -8791,7 +8935,7 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
               OCL_PTR *ocl = (OCL_PTR *) backend_ctx->ocl;
 
-              tmp_device[c] = ocl->clCreateBuffer (context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
+              tmp_device[c] = ocl->clCreateBuffer (device_param->opencl_context, CL_MEM_READ_WRITE, MAX_ALLOC_CHECKS_SIZE, NULL, &CL_err);
 
               if (CL_err != CL_SUCCESS)
               {
@@ -8804,11 +8948,11 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
               u8 tmp_host[8];
 
-              if (ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
-              if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+              if (ocl->clEnqueueReadBuffer  (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+              if (ocl->clEnqueueWriteBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, 0, sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
 
-              if (ocl->clEnqueueReadBuffer  (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
-              if (ocl->clEnqueueWriteBuffer (command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+              if (ocl->clEnqueueReadBuffer  (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
+              if (ocl->clEnqueueWriteBuffer (device_param->opencl_command_queue, tmp_device[c], CL_TRUE, MAX_ALLOC_CHECKS_SIZE - sizeof (tmp_host), sizeof (tmp_host), tmp_host, 0, NULL, NULL) != CL_SUCCESS) break;
             }
 
             device_param->device_available_mem = MAX_ALLOC_CHECKS_SIZE;
@@ -8820,24 +8964,28 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
             // clean up
 
+            int r = 0;
+
             for (c = 0; c < MAX_ALLOC_CHECKS_CNT; c++)
             {
               if (((c + 1 + 1) * MAX_ALLOC_CHECKS_SIZE) >= device_param->device_global_mem) break;
 
               if (tmp_device[c] != NULL)
               {
-                if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) return -1;
+                if (hc_clReleaseMemObject (hashcat_ctx, tmp_device[c]) == -1) r = -1;
               }
             }
 
             hcfree (tmp_device);
+
+            if (r == -1)
+            {
+              // TODO: at least one hc_clReleaseMemObject() above failed; return -1 here is blocking (it makes backend_ctx_devices_init fail), so the error is deliberately ignored until this is better evaluated
+              //return -1;
+            }
           }
         }
 
-        hc_clReleaseCommandQueue (hashcat_ctx, command_queue);
-
-        hc_clReleaseContext (hashcat_ctx, context);
-
         if (device_param->device_host_unified_memory == 1)
         {
           // so, we actually have only half the memory because we need the same buffers on host side
@@ -8858,13 +9006,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
   backend_ctx->target_msec  = TARGET_MSEC_PROFILE[user_options->workload_profile - 1];
 
-  backend_ctx->need_adl           = need_adl;
-  backend_ctx->need_nvml          = need_nvml;
-  backend_ctx->need_nvapi         = need_nvapi;
-  backend_ctx->need_sysfs_amdgpu  = need_sysfs_amdgpu;
-  backend_ctx->need_sysfs_cpu     = need_sysfs_cpu;
-  backend_ctx->need_iokit         = need_iokit;
-
   backend_ctx->comptime = comptime;
 
   return 0;
@@ -8884,23 +9025,61 @@ void backend_ctx_devices_destroy (hashcat_ctx_t *hashcat_ctx)
     hcfree (backend_ctx->opencl_platforms_version[opencl_platforms_idx]);
   }
 
+  // one-time release context/command-queue from all runtimes
+
   for (int backend_devices_idx = 0; backend_devices_idx < backend_ctx->backend_devices_cnt; backend_devices_idx++)
   {
     hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
 
     hcfree (device_param->device_name);
 
+    if (device_param->is_cuda == true)
+    {
+      if (device_param->cuda_context)
+      {
+        hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context);
+
+        device_param->cuda_context = NULL;
+      }
+    }
+
+    if (device_param->is_hip == true)
+    {
+      hcfree (device_param->gcnArchName);
+    }
+
+    #if defined (__APPLE__)
+    if (device_param->is_metal == true)
+    {
+      if (device_param->metal_command_queue)
+      {
+        hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue);
+
+        device_param->metal_command_queue = NULL;
+      }
+    }
+    #endif
+
     if (device_param->is_opencl == true)
     {
       hcfree (device_param->opencl_driver_version);
       hcfree (device_param->opencl_device_version);
       hcfree (device_param->opencl_device_c_version);
       hcfree (device_param->opencl_device_vendor);
-    }
 
-    if (device_param->is_hip == true)
-    {
-      hcfree (device_param->gcnArchName);
+      if (device_param->opencl_command_queue)
+      {
+        hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue);
+
+        device_param->opencl_command_queue = NULL;
+      }
+
+      if (device_param->opencl_context)
+      {
+        hc_clReleaseContext (hashcat_ctx, device_param->opencl_context);
+
+        device_param->opencl_context = NULL;
+      }
     }
   }
 
@@ -8952,7 +9131,8 @@ void backend_ctx_devices_sync_tuning (hashcat_ctx_t *hashcat_ctx)
       device_param_dst->kernel_loops   = device_param_src->kernel_loops;
       device_param_dst->kernel_threads = device_param_src->kernel_threads;
 
-      const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param_dst->device_processors) * device_param_dst->kernel_threads;
+      const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_param_dst->device_processors)
+                               * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : device_param_dst->kernel_threads);
 
       device_param_dst->hardware_power = hardware_power;
 
@@ -9154,7 +9334,14 @@ static int get_opencl_kernel_wgs (hashcat_ctx_t *hashcat_ctx, hc_device_param_t
 
   if (cwgs_total > 0)
   {
-    kernel_threads = MIN (kernel_threads, (u32) cwgs_total);
+    if (kernel_threads < cwgs_total)
+    {
+      // Very likely a runtime bug: it reports a CL_KERNEL_WORK_GROUP_SIZE smaller than the work-group size this kernel was compiled for. Warn only; the compiled size is still used below.
+
+      event_log_warning (hashcat_ctx, "* Device #%u: Runtime returned CL_KERNEL_WORK_GROUP_SIZE=%d, but CL_KERNEL_COMPILE_WORK_GROUP_SIZE=%d. Use -T%d if you run into problems.", device_param->device_id + 1, (int) kernel_threads, (int) cwgs_total, (int) kernel_threads);
+    }
+
+    kernel_threads = (u32) cwgs_total;
   }
 
   *result = kernel_threads;
@@ -9522,7 +9709,11 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
 
       hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "-D MAX_THREADS_PER_BLOCK=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
       hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-architecture=%s", device_param->gcnArchName);
-      hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
+
+      if ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) == 0)
+      {
+        hc_asprintf (&hiprtc_options[hiprtc_options_idx++], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max);
+      }
 
       // untested but it should work
       #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__)
@@ -10024,10 +10215,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
           {
             const u64 device_available_mem_sav = device_param->device_available_mem;
 
-            const u64 device_available_mem_new = device_available_mem_sav - (device_available_mem_sav * 0.2);
+            const u64 device_available_mem_new = device_available_mem_sav - (device_available_mem_sav * 0.34);
 
             event_log_warning (hashcat_ctx, "* Device #%u: This system does not offer any reliable method to query actual free memory. Estimated base: %" PRIu64, device_id + 1, device_available_mem_sav);
-            event_log_warning (hashcat_ctx, "             Assuming normal desktop activity, reducing estimate by 20%%: %" PRIu64, device_available_mem_new);
+            event_log_warning (hashcat_ctx, "             Assuming normal desktop activity, reducing estimate by 34%%: %" PRIu64, device_available_mem_new);
             event_log_warning (hashcat_ctx, "             This can hurt performance drastically, especially on memory-heavy algorithms.");
             event_log_warning (hashcat_ctx, "             You can adjust this percentage using --backend-devices-keepfree");
             event_log_warning (hashcat_ctx, NULL);
@@ -10436,7 +10627,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
      * device properties
      */
 
-    const u32 device_processors = device_param->device_processors;
+    //const u32 device_processors = device_param->device_processors;
 
     /**
      * device threads
@@ -10537,93 +10728,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     }
     #endif
 
-    /**
-     * create context for each device
-     */
-
-    if (device_param->is_cuda == true)
-    {
-      if (hc_cuCtxCreate (hashcat_ctx, &device_param->cuda_context, CU_CTX_SCHED_BLOCKING_SYNC, device_param->cuda_device) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
-      if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-    }
-
-    if (device_param->is_hip == true)
-    {
-      if (hc_hipCtxCreate (hashcat_ctx, &device_param->hip_context, hipDeviceScheduleBlockingSync, device_param->hip_device) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
-      if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-    }
-
-    #if defined (__APPLE__)
-    if (device_param->is_metal == true)
-    {
-      /**
-       * create command-queue
-       */
-
-      if (hc_mtlCreateCommandQueue (hashcat_ctx, device_param->metal_device, &device_param->metal_command_queue) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-    }
-    #endif
-
-    if (device_param->is_opencl == true)
-    {
-      /*
-      cl_context_properties properties[3];
-
-      properties[0] = CL_CONTEXT_PLATFORM;
-      properties[1] = (cl_context_properties) device_param->opencl_platform;
-      properties[2] = 0;
-
-      CL_rc = hc_clCreateContext (hashcat_ctx, properties, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context);
-      */
-
-      if (hc_clCreateContext (hashcat_ctx, NULL, 1, &device_param->opencl_device, NULL, NULL, &device_param->opencl_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-
-      /**
-       * create command-queue
-       */
-
-      // not supported with NV
-      // device_param->opencl_command_queue = hc_clCreateCommandQueueWithProperties (hashcat_ctx, device_param->opencl_device, NULL);
-
-      if (hc_clCreateCommandQueue (hashcat_ctx, device_param->opencl_context, device_param->opencl_device, CL_QUEUE_PROFILING_ENABLE, &device_param->opencl_command_queue) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-    }
+    // re-using context/command-queue, there is no need to re-initialize them
 
     /**
      * create stream for CUDA devices
@@ -10631,6 +10736,13 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     if (device_param->is_cuda == true)
     {
+      if (hc_cuCtxPushCurrent (hashcat_ctx, device_param->cuda_context) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
       if (hc_cuStreamCreate (hashcat_ctx, &device_param->cuda_stream, CU_STREAM_DEFAULT) == -1)
       {
         device_param->skipped = true;
@@ -10645,7 +10757,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     if (device_param->is_hip == true)
     {
-      if (hc_hipStreamCreate (hashcat_ctx, &device_param->hip_stream, hipStreamDefault) == -1)
+      if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      if (hc_hipStreamCreateWithFlags (hashcat_ctx, &device_param->hip_stream, hipStreamDefault) == -1)
       {
         device_param->skipped = true;
 
@@ -10687,21 +10806,21 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     if (device_param->is_hip == true)
     {
-      if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event1, hipEventBlockingSync) == -1)
+      if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event1, hipEventBlockingSync) == -1)
       {
         device_param->skipped = true;
 
         continue;
       }
 
-      if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event2, hipEventBlockingSync) == -1)
+      if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event2, hipEventBlockingSync) == -1)
       {
         device_param->skipped = true;
 
         continue;
       }
 
-      if (hc_hipEventCreate (hashcat_ctx, &device_param->hip_event3, hipEventDisableTiming) == -1)
+      if (hc_hipEventCreateWithFlags (hashcat_ctx, &device_param->hip_event3, hipEventDisableTiming) == -1)
       {
         device_param->skipped = true;
 
@@ -10930,9 +11049,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     // we don't have sm_* on vendors not NV but it doesn't matter
 
     #if defined (DEBUG)
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode);
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D HAS_SHFW=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->has_shfw, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode);
     #else
-    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode);
+    build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VADD_CO=%u -D HAS_VADDC_CO=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VSUB_CO=%u -D HAS_VSUBB_CO=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D HAS_SHFW=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D ATTACK_MODE=%u -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vadd_co, device_param->has_vaddc_co, device_param->has_vsub, device_param->has_vsubb, device_param->has_vsub_co, device_param->has_vsubb_co, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->has_shfw, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern, user_options->attack_mode);
     #endif
 
     build_options_buf[build_options_len] = 0;
@@ -11793,16 +11912,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf,   size_st_salts)           == -1) return -1;
       if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_kernel_param,   size_kernel_params)      == -1) return -1;
 
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size, device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_digests_buf, hashes->digests_buf,     size_digests,            device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_salt_bufs,   hashes->salts_buf,       size_salts,              device_param->cuda_stream) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_digests_buf, hashes->digests_buf,     size_digests)            == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_salt_bufs,   hashes->salts_buf,       size_salts)              == -1) return -1;
 
       /**
        * special buffers
@@ -11829,7 +11948,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
             if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_rules_c, size_rules_c) == -1) return -1;
           }
 
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules_src, device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_rules, straight_ctx->kernel_rules_buf, size_rules_src) == -1) return -1;
         }
         else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
         {
@@ -11864,19 +11983,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       {
         if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_esalt_bufs, size_esalts) == -1) return -1;
 
-        if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_esalt_bufs, hashes->esalts_buf, size_esalts, device_param->cuda_stream) == -1) return -1;
+        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_esalt_bufs, hashes->esalts_buf, size_esalts) == -1) return -1;
       }
 
       if (hashconfig->st_hash != NULL)
       {
-        if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_st_digests_buf, hashes->st_digests_buf, size_st_digests, device_param->cuda_stream) == -1) return -1;
-        if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_st_salts_buf,   hashes->st_salts_buf,   size_st_salts,   device_param->cuda_stream)   == -1) return -1;
+        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_digests_buf, hashes->st_digests_buf, size_st_digests) == -1) return -1;
+        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_salts_buf,   hashes->st_salts_buf,   size_st_salts)   == -1) return -1;
 
         if (size_esalts)
         {
           if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_esalts_buf, size_st_esalts) == -1) return -1;
 
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts, device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts) == -1) return -1;
         }
       }
     }
@@ -11904,16 +12023,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_salts_buf,   size_st_salts)           == -1) return -1;
       if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_kernel_param,   size_kernel_params)      == -1) return -1;
 
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size, device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_digests_buf, hashes->digests_buf,     size_digests,            device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_salt_bufs,   hashes->salts_buf,       size_salts,              device_param->hip_stream) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_a, bitmap_ctx->bitmap_s1_a, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_b, bitmap_ctx->bitmap_s1_b, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_c, bitmap_ctx->bitmap_s1_c, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s1_d, bitmap_ctx->bitmap_s1_d, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_a, bitmap_ctx->bitmap_s2_a, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_b, bitmap_ctx->bitmap_s2_b, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_c, bitmap_ctx->bitmap_s2_c, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bitmap_s2_d, bitmap_ctx->bitmap_s2_d, bitmap_ctx->bitmap_size) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_digests_buf, hashes->digests_buf,     size_digests)            == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_salt_bufs,   hashes->salts_buf,       size_salts)              == -1) return -1;
 
       /**
        * special buffers
@@ -11940,7 +12059,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
             if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_rules_c, size_rules_c) == -1) return -1;
           }
 
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_rules, straight_ctx->kernel_rules_buf, size_rules_src, device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_rules, straight_ctx->kernel_rules_buf, size_rules_src) == -1) return -1;
         }
         else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
         {
@@ -11975,19 +12094,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       {
         if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_esalt_bufs, size_esalts) == -1) return -1;
 
-        if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_esalt_bufs, hashes->esalts_buf, size_esalts, device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_esalt_bufs, hashes->esalts_buf, size_esalts) == -1) return -1;
       }
 
       if (hashconfig->st_hash != NULL)
       {
-        if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_st_digests_buf, hashes->st_digests_buf, size_st_digests, device_param->hip_stream) == -1) return -1;
-        if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_st_salts_buf,   hashes->st_salts_buf,   size_st_salts,   device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_digests_buf, hashes->st_digests_buf, size_st_digests) == -1) return -1;
+        if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_salts_buf,   hashes->st_salts_buf,   size_st_salts) == -1) return -1;
 
         if (size_esalts)
         {
           if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_esalts_buf, size_st_esalts) == -1) return -1;
 
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts, device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_st_esalts_buf, hashes->st_esalts_buf, size_st_esalts) == -1) return -1;
         }
       }
     }
@@ -12125,15 +12244,15 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_st_salts,           NULL, &device_param->opencl_d_st_salts_buf)   == -1) return -1;
       if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY,   size_kernel_params,      NULL, &device_param->opencl_d_kernel_param)   == -1) return -1;
 
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_a, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_a, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_b, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_c, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_d, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_a, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_b, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_c, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_d, CL_FALSE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_buf, CL_FALSE, 0, size_digests,            hashes->digests_buf,     0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_a, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_b, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_c, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s1_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s1_d, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_a, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_a, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_b, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_b, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_c, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_c, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bitmap_s2_d, CL_TRUE, 0, bitmap_ctx->bitmap_size, bitmap_ctx->bitmap_s2_d, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_digests_buf, CL_TRUE, 0, size_digests,            hashes->digests_buf,     0, NULL, NULL) == -1) return -1;
       if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_salt_bufs,   CL_FALSE, 0, size_salts,              hashes->salts_buf,       0, NULL, NULL) == -1) return -1;
 
       /**
@@ -12151,7 +12270,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
           if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules,   NULL, &device_param->opencl_d_rules)   == -1) return -1;
           if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_rules_c, NULL, &device_param->opencl_d_rules_c) == -1) return -1;
 
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, CL_FALSE, 0, size_rules_src, straight_ctx->kernel_rules_buf, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_rules, CL_TRUE, 0, size_rules_src, straight_ctx->kernel_rules_buf, 0, NULL, NULL) == -1) return -1;
         }
         else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
         {
@@ -12174,7 +12293,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       {
         if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_esalts, NULL, &device_param->opencl_d_esalt_bufs) == -1) return -1;
 
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_esalt_bufs, CL_FALSE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL) == -1) return -1;
+        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_esalt_bufs, CL_TRUE, 0, size_esalts, hashes->esalts_buf, 0, NULL, NULL) == -1) return -1;
       }
 
       if (hashconfig->st_hash != NULL)
@@ -12186,7 +12305,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         {
           if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_esalts, NULL, &device_param->opencl_d_st_esalts_buf) == -1) return -1;
 
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_esalts_buf, CL_FALSE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_st_esalts_buf, CL_TRUE, 0, size_st_esalts, hashes->st_esalts_buf, 0, NULL, NULL) == -1) return -1;
         }
       }
 
@@ -15948,6 +16067,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       {
         u32 threads_per_block_with_regs = (floor) ((float) device_param->regsPerBlock / num_regs);
 
+        if (threads_per_block_with_regs == 0)
+        {
+          // the runtime may yield 0 here due to a bug; fall back so the MIN () below cannot reduce threads_per_block to 0
+          threads_per_block_with_regs = threads_per_block;
+        }
+
         if (threads_per_block_with_regs > device_param->kernel_preferred_wgs_multiple) threads_per_block_with_regs -= threads_per_block_with_regs % device_param->kernel_preferred_wgs_multiple;
 
         threads_per_block = MIN (threads_per_block, threads_per_block_with_regs);
@@ -15967,6 +16092,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       {
         u32 threads_per_block_with_regs = (floor) ((float) device_param->regsPerBlock / num_regs);
 
+        if (threads_per_block_with_regs == 0)
+        {
+          // https://rocm.docs.amd.com/projects/HIP/en/docs-develop/doxygen/html/bug.html
+          // HIP-Clang always returns 0 for regsPerBlock due to a known bug
+          // prevent threads_per_block from resulting in 0, otherwise hashcat crashes
+          threads_per_block_with_regs = threads_per_block;
+        }
+
         if (threads_per_block_with_regs > device_param->kernel_preferred_wgs_multiple) threads_per_block_with_regs -= threads_per_block_with_regs % device_param->kernel_preferred_wgs_multiple;
 
         threads_per_block = MIN (threads_per_block, threads_per_block_with_regs);
@@ -15987,6 +16120,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       threads_per_block = device_param->kernel_preferred_wgs_multiple;
     }
 
+    if (user_options->kernel_threads_chgd == true)
+    {
+      if (threads_per_block < user_options->kernel_threads)
+      {
+        event_log_warning (hashcat_ctx, "* Device #%u: The requested thread size '%d' exceeds the recommended limit of the backend runtime '%d'.", device_id + 1, user_options->kernel_threads, threads_per_block);
+      }
+    }
+
     if ((threads_per_block >= device_param->kernel_threads_min) && (threads_per_block <= device_param->kernel_threads_max))
     {
       //printf ("auto thread max: %d\n", threads_per_block);
@@ -16042,10 +16183,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       device_param->overtune_unfriendly = true;
     }
 
-    //    device_param->kernel_threads = kernel_threads;
-    device_param->kernel_threads = 0;
 
-    u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * device_param->kernel_threads_max;
+    device_param->kernel_threads = 0;
+    device_param->kernel_accel = 0;
+
+    u32 kernel_threads_min = device_param->kernel_threads_min;
+    u32 kernel_threads_max = device_param->kernel_threads_max;
 
     u32 kernel_accel_min = device_param->kernel_accel_min;
     u32 kernel_accel_max = device_param->kernel_accel_max;
@@ -16058,12 +16201,23 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     {
       const u64 GiB4 = 4ULL * 1024 * 1024 * 1024;
 
-      event_log_warning (hashcat_ctx, "Couldn't query the OS for free memory, assuming 4GiB");
+      event_log_warning (hashcat_ctx, "Couldn't query the OS for free memory, assuming 4GiB is available per compute device");
 
       accel_limit_host = GiB4;
     }
     else
     {
+      if (user_options->backend_devices_keepfree)
+      {
+        accel_limit_host = ((u64) accel_limit_host * (100 - user_options->backend_devices_keepfree)) / 100;
+      }
+      else
+      {
+        accel_limit_host = accel_limit_host - (accel_limit_host * 0.34);
+      }
+
+      accel_limit_host /= backend_ctx->backend_devices_active;
+
       // even tho let's not be greedy
 
       const u64 GiB8 = 8ULL * 1024 * 1024 * 1024;
@@ -16094,27 +16248,29 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       if (device_param->is_hip    == true) local_size_bytes = hip_query_max_local_size_bytes    (hashcat_ctx, device_param);
       if (device_param->is_opencl == true) local_size_bytes = opencl_query_max_local_size_bytes (hashcat_ctx, device_param);
       // metal todo
-
-      // use this parameter to tune down kernel_accel_max, because it has such a huge impact on memory requirement
-      // let's target a maximum use of memory of 8GiB so that there's some room left for other stuff
-
-      if (local_size_bytes)
-      {
-        const u64 SIZE_8GiB = 8ULL * 1024 * 1024 * 1024;
-
-        const u64 max_accel = SIZE_8GiB / (hardware_power_max * local_size_bytes);
-
-        kernel_accel_max = MIN (kernel_accel_max, max_accel);
-      }
     }
 
     const u64 size_device_extra1234 = size_extra_buffer1 + size_extra_buffer2 + size_extra_buffer3 + size_extra_buffer4;
 
-    const u64 size_device_extra = MAX ((1024 * 1024 * 1024), size_device_extra1234);
+    // NOTE: the 64MiB lower bound below is a heuristic that has not been fully validated yet
 
-    while (kernel_accel_max >= kernel_accel_min)
+    const u64 size_device_extra = MAX ((64ULL * 1024 * 1024), size_device_extra1234);
+
+    // on a memory limit hit we first decrease accel; once accel reaches its minimum, we decrease threads instead
+    // each time threads is decreased, accel_max is restored to its saved value
+
+    int memory_limit_hit = 0;
+
+    const u32 kernel_accel_max_sav = kernel_accel_max;
+
+    while ((kernel_accel_max >= kernel_accel_min) || (kernel_threads_max >= kernel_threads_min))
     {
-      const u64 kernel_power_max = hardware_power_max * kernel_accel_max;
+      const u64 device_processors = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_param->device_processors);
+      const u64 kernel_threads    = ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads_max);
+
+      const u64 kernel_power_max = device_processors * kernel_threads * kernel_accel_max;
+
+      // size_spilling
 
       const u64 size_spilling = kernel_power_max * local_size_bytes;
 
@@ -16161,21 +16317,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       // now check if all device-memory sizes which depend on the kernel_accel_max amplifier are within its boundaries
       // if not, decrease amplifier and try again
 
-      int memory_limit_hit = 0;
+      memory_limit_hit = 0;
 
       // sometimes device_available_mem and device_maxmem_alloc reported back from the opencl runtime are a bit inaccurate.
       // let's add some extra space just to be sure.
       // now depends on the kernel-accel value (where scrypt and similar benefits), but also hard minimum 64mb and maximum 1024mb limit
       // let's see if we still need this now that we have low-level API to report free memory
 
-      u64 EXTRA_SPACE = 4096; //(1024ULL * 1024ULL) * kernel_accel_max;
-
-      //EXTRA_SPACE = MAX (EXTRA_SPACE, ( 256ULL * 1024ULL * 1024ULL));
-      //EXTRA_SPACE = MIN (EXTRA_SPACE, (1024ULL * 1024ULL * 1024ULL));
-
-      if ((size_pws   + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
-      if ((size_tmps  + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
-      if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1;
+      if (size_pws   > device_param->device_maxmem_alloc) memory_limit_hit = 1;
+      if (size_tmps  > device_param->device_maxmem_alloc) memory_limit_hit = 1;
+      if (size_hooks > device_param->device_maxmem_alloc) memory_limit_hit = 1;
 
       // work around, for some reason apple opencl can't have buffers larger 2^31
       // typically runs into trap 6
@@ -16253,14 +16404,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         + size_kernel_params
         + size_spilling;
 
-      if ((size_total + EXTRA_SPACE) > device_param->device_available_mem) memory_limit_hit = 1;
-
-      if (memory_limit_hit == 1)
-      {
-        kernel_accel_max--;
-
-        continue;
-      }
+      if (size_total > device_param->device_available_mem) memory_limit_hit = 1;
 
       const u64 size_host_extra = (512 * 1024 * 1024);
 
@@ -16276,11 +16420,29 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
         + size_pws_base
         + size_host_extra;
 
-      if ((size_total_host + EXTRA_SPACE) > accel_limit_host) memory_limit_hit = 1;
+      if (size_total_host > accel_limit_host) memory_limit_hit = 1;
+
+      //printf ("%zu %zu %d %d\n", size_total, device_param->device_available_mem, kernel_accel_max, kernel_threads_max);
 
       if (memory_limit_hit == 1)
       {
-        kernel_accel_max--;
+        if (kernel_accel_max == kernel_accel_min)
+        {
+          if ((kernel_threads_max > kernel_threads_min) && (kernel_threads_max >= (device_param->kernel_preferred_wgs_multiple * 2)))
+          {
+            kernel_threads_max -= device_param->kernel_preferred_wgs_multiple;
+
+            kernel_accel_max = kernel_accel_max_sav;
+          }
+          else
+          {
+            break;
+          }
+        }
+        else
+        {
+          kernel_accel_max--;
+        }
 
         continue;
       }
@@ -16290,13 +16452,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       break;
     }
 
-    if (kernel_accel_max < kernel_accel_min)
+    if (memory_limit_hit == 1)
     {
       event_log_error (hashcat_ctx, "* Device #%u: Not enough allocatable device memory or free host memory for mapping.", device_id + 1);
 
       backend_memory_hit_warnings++;
 
       device_param->skipped_warning = true;
+
       continue;
     }
 
@@ -16307,7 +16470,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     {
       while (kernel_accel_max > kernel_accel_min)
       {
-        const u64 kernel_power_max = hardware_power_max * kernel_accel_max;
+        const u64 kernel_power_max = device_param->device_processors * kernel_accel_max;
 
         if (kernel_power_max > hashes->salts_cnt)
         {
@@ -16320,6 +16483,12 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       }
     }
 
+    device_param->kernel_threads_min = kernel_threads_min;
+    device_param->kernel_threads_max = kernel_threads_max;
+
+    const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE)     ? 1 : device_param->device_processors)
+                                 * ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : device_param->kernel_threads_max);
+
     device_param->kernel_accel_min = kernel_accel_min;
     device_param->kernel_accel_max = kernel_accel_max;
 
@@ -16684,16 +16853,6 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
       }
     }
 
-    if (device_param->is_hip == true)
-    {
-      if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1)
-      {
-        device_param->skipped = true;
-
-        continue;
-      }
-    }
-
     hardware_power_all += hardware_power_max;
 
     EVENT_DATA (EVENT_BACKEND_DEVICE_INIT_POST, &backend_devices_idx, sizeof (int));
@@ -16815,7 +16974,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       if (device_param->cuda_module_amp)       hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_amp);
       if (device_param->cuda_module_shared)    hc_cuModuleUnload (hashcat_ctx, device_param->cuda_module_shared);
 
-      if (device_param->cuda_context)          hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context);
+      //if (device_param->cuda_context)          hc_cuCtxDestroy (hashcat_ctx, device_param->cuda_context);
 
       device_param->cuda_d_pws_buf            = 0;
       device_param->cuda_d_pws_amp_buf        = 0;
@@ -16892,7 +17051,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       device_param->cuda_module_amp           = NULL;
       device_param->cuda_module_shared        = NULL;
 
-      device_param->cuda_context              = NULL;
+      //device_param->cuda_context              = NULL;
     }
 
     if (device_param->is_hip == true)
@@ -16946,8 +17105,6 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       if (device_param->hip_module_amp)       hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_amp);
       if (device_param->hip_module_shared)    hc_hipModuleUnload (hashcat_ctx, device_param->hip_module_shared);
 
-      if (device_param->hip_context)          hc_hipCtxDestroy (hashcat_ctx, device_param->hip_context);
-
       device_param->hip_d_pws_buf            = 0;
       device_param->hip_d_pws_amp_buf        = 0;
       device_param->hip_d_pws_comp_buf       = 0;
@@ -17022,8 +17179,6 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       device_param->hip_module_mp            = NULL;
       device_param->hip_module_amp           = NULL;
       device_param->hip_module_shared        = NULL;
-
-      device_param->hip_context              = NULL;
     }
 
     #if defined (__APPLE__)
@@ -17098,7 +17253,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       if (device_param->metal_library_amp)            hc_mtlReleaseLibrary (hashcat_ctx, device_param->metal_library_amp);
       if (device_param->metal_library_shared)         hc_mtlReleaseLibrary (hashcat_ctx, device_param->metal_library_shared);
 
-      if (device_param->metal_command_queue)          hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue);
+      //if (device_param->metal_command_queue)          hc_mtlReleaseCommandQueue (hashcat_ctx, device_param->metal_command_queue);
 
       //if (device_param->metal_device)                 hc_mtlReleaseDevice (hashcat_ctx, device_param->metal_device);
 
@@ -17168,7 +17323,7 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       device_param->metal_library_mp             = NULL;
       device_param->metal_library_amp            = NULL;
       device_param->metal_library_shared         = NULL;
-      device_param->metal_command_queue          = NULL;
+      //device_param->metal_command_queue          = NULL;
       //device_param->metal_device                 = NULL;
     }
     #endif // __APPLE__
@@ -17244,9 +17399,9 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       if (device_param->opencl_program_amp)      hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_amp);
       if (device_param->opencl_program_shared)   hc_clReleaseProgram (hashcat_ctx, device_param->opencl_program_shared);
 
-      if (device_param->opencl_command_queue)    hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue);
+      //if (device_param->opencl_command_queue)    hc_clReleaseCommandQueue (hashcat_ctx, device_param->opencl_command_queue);
 
-      if (device_param->opencl_context)          hc_clReleaseContext (hashcat_ctx, device_param->opencl_context);
+      //if (device_param->opencl_context)          hc_clReleaseContext (hashcat_ctx, device_param->opencl_context);
 
       device_param->opencl_d_pws_buf           = NULL;
       device_param->opencl_d_pws_amp_buf       = NULL;
@@ -17314,8 +17469,8 @@ void backend_session_destroy (hashcat_ctx_t *hashcat_ctx)
       device_param->opencl_program_mp          = NULL;
       device_param->opencl_program_amp         = NULL;
       device_param->opencl_program_shared      = NULL;
-      device_param->opencl_command_queue       = NULL;
-      device_param->opencl_context             = NULL;
+      //device_param->opencl_command_queue       = NULL;
+      //device_param->opencl_context             = NULL;
     }
 
     device_param->h_tmps              = NULL;
@@ -17452,14 +17607,14 @@ int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx)
 
     if (device_param->is_cuda == true)
     {
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css,   device_param->cuda_stream)   == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->cuda_stream) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css)   == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1;
     }
 
     if (device_param->is_hip == true)
     {
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css,   device_param->hip_stream)   == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->hip_stream) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css)   == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1;
     }
 
     #if defined (__APPLE__)
@@ -17473,7 +17628,7 @@ int backend_session_update_mp (hashcat_ctx_t *hashcat_ctx)
     if (device_param->is_opencl == true)
     {
       if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_root_css_buf,   CL_FALSE, 0, device_param->size_root_css,   mask_ctx->root_css_buf,   0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_FALSE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_TRUE,  0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1;
 
       if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
     }
@@ -17508,14 +17663,14 @@ int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_
 
     if (device_param->is_cuda == true)
     {
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css,   device_param->cuda_stream) == -1) return -1;
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->cuda_stream) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css)   == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1;
     }
 
     if (device_param->is_hip == true)
     {
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css,   device_param->hip_stream) == -1) return -1;
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css, device_param->hip_stream) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_root_css_buf,   mask_ctx->root_css_buf,   device_param->size_root_css)   == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_markov_css_buf, mask_ctx->markov_css_buf, device_param->size_markov_css) == -1) return -1;
     }
 
     #if defined (__APPLE__)
@@ -17529,7 +17684,7 @@ int backend_session_update_mp_rl (hashcat_ctx_t *hashcat_ctx, const u32 css_cnt_
     if (device_param->is_opencl == true)
     {
       if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_root_css_buf,   CL_FALSE, 0, device_param->size_root_css,   mask_ctx->root_css_buf,   0, NULL, NULL) == -1) return -1;
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_FALSE, 0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_markov_css_buf, CL_TRUE,  0, device_param->size_markov_css, mask_ctx->markov_css_buf, 0, NULL, NULL) == -1) return -1;
 
       if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
     }
diff --git a/src/bitmap.c b/src/bitmap.c
index 367b16a7c..1652b7cb9 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -81,8 +81,8 @@ int bitmap_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage         > 0)    return 0;
   if (user_options->backend_info  > 0)    return 0;
+  if (user_options->hash_info     > 0)    return 0;
 
-  if (user_options->hash_info    == true) return 0;
   if (user_options->keyspace     == true) return 0;
   if (user_options->left         == true) return 0;
   if (user_options->show         == true) return 0;
diff --git a/src/bridges.c b/src/bridges.c
index 9b37aa25a..3df210f80 100644
--- a/src/bridges.c
+++ b/src/bridges.c
@@ -87,12 +87,12 @@ bool bridges_init (hashcat_ctx_t *hashcat_ctx)
   user_options_t  *user_options = hashcat_ctx->user_options;
   hashconfig_t    *hashconfig   = hashcat_ctx->hashconfig;
 
-  if (user_options->hash_info    == true) return true;
+  if (user_options->backend_info  > 0)    return true;
+  if (user_options->hash_info     > 0)    return true;
+  if (user_options->usage         > 0)    return true;
   if (user_options->left         == true) return true;
   if (user_options->show         == true) return true;
-  if (user_options->usage         > 0)    return true;
   if (user_options->version      == true) return true;
-  if (user_options->backend_info  > 0)    return true;
 
   // There is a problem here. At this point, hashconfig is not yet initialized.
   // This is because initializing hashconfig requires the module to be loaded,
@@ -241,12 +241,12 @@ bool bridges_salt_prepare (hashcat_ctx_t *hashcat_ctx)
   hashes_t        *hashes       = hashcat_ctx->hashes;
   user_options_t  *user_options = hashcat_ctx->user_options;
 
-  if (user_options->hash_info    == true) return true;
+  if (user_options->backend_info  > 0)    return true;
+  if (user_options->hash_info     > 0)    return true;
+  if (user_options->usage         > 0)    return true;
   if (user_options->left         == true) return true;
   if (user_options->show         == true) return true;
-  if (user_options->usage         > 0)    return true;
   if (user_options->version      == true) return true;
-  if (user_options->backend_info  > 0)    return true;
 
   if (bridge_ctx->enabled == false) return true;
 
diff --git a/src/combinator.c b/src/combinator.c
index 654ada03f..47715ff5e 100644
--- a/src/combinator.c
+++ b/src/combinator.c
@@ -21,8 +21,8 @@ int combinator_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage         > 0)    return 0;
   if (user_options->backend_info  > 0)    return 0;
+  if (user_options->hash_info     > 0)    return 0;
 
-  if (user_options->hash_info    == true) return 0;
   if (user_options->left         == true) return 0;
   if (user_options->show         == true) return 0;
   if (user_options->version      == true) return 0;
diff --git a/src/cpt.c b/src/cpt.c
index 8dc393c3a..97627b8c0 100644
--- a/src/cpt.c
+++ b/src/cpt.c
@@ -17,8 +17,8 @@ int cpt_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage         > 0)    return 0;
   if (user_options->backend_info  > 0)    return 0;
+  if (user_options->hash_info     > 0)    return 0;
 
-  if (user_options->hash_info    == true) return 0;
   if (user_options->keyspace     == true) return 0;
   if (user_options->left         == true) return 0;
   if (user_options->show         == true) return 0;
diff --git a/src/debugfile.c b/src/debugfile.c
index 20ee3ad1f..25b0a6033 100644
--- a/src/debugfile.c
+++ b/src/debugfile.c
@@ -118,9 +118,9 @@ int debugfile_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage          > 0)    return 0;
   if (user_options->backend_info   > 0)    return 0;
+  if (user_options->hash_info      > 0)    return 0;
 
   if (user_options->benchmark     == true) return 0;
-  if (user_options->hash_info     == true) return 0;
   if (user_options->keyspace      == true) return 0;
   if (user_options->left          == true) return 0;
   if (user_options->show          == true) return 0;
diff --git a/src/dictstat.c b/src/dictstat.c
index a607cf7d6..416eb90b3 100644
--- a/src/dictstat.c
+++ b/src/dictstat.c
@@ -58,9 +58,9 @@ int dictstat_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage          > 0)    return 0;
   if (user_options->backend_info   > 0)    return 0;
+  if (user_options->hash_info      > 0)    return 0;
 
   if (user_options->benchmark     == true) return 0;
-  if (user_options->hash_info     == true) return 0;
   if (user_options->keyspace      == true) return 0;
   if (user_options->left          == true) return 0;
   if (user_options->show          == true) return 0;
diff --git a/src/dispatch.c b/src/dispatch.c
index ad7c9851b..9ac171f31 100644
--- a/src/dispatch.c
+++ b/src/dispatch.c
@@ -381,7 +381,7 @@ HC_API_CALL void *thread_calc_stdin (void *p)
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
   }
 
   if (calc_stdin (hashcat_ctx, device_param) == -1)
@@ -396,11 +396,6 @@ HC_API_CALL void *thread_calc_stdin (void *p)
     if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL;
   }
 
-  if (device_param->is_hip == true)
-  {
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
-  }
-
   if (bridge_ctx->enabled == true)
   {
     if (bridge_ctx->thread_term != BRIDGE_DEFAULT)
@@ -1685,7 +1680,7 @@ HC_API_CALL void *thread_calc (void *p)
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
   }
 
   if (calc (hashcat_ctx, device_param) == -1)
@@ -1700,11 +1695,6 @@ HC_API_CALL void *thread_calc (void *p)
     if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL;
   }
 
-  if (device_param->is_hip == true)
-  {
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
-  }
-
   if (bridge_ctx->enabled == true)
   {
     if (bridge_ctx->thread_term != BRIDGE_DEFAULT)
diff --git a/src/ext_cuda.c b/src/ext_cuda.c
index 4c0db9ffb..878977ade 100644
--- a/src/ext_cuda.c
+++ b/src/ext_cuda.c
@@ -87,14 +87,19 @@ int cuda_init (void *hashcat_ctx)
   HC_LOAD_FUNC_CUDA (cuda, cuLaunchKernel,           cuLaunchKernel,            CUDA_CULAUNCHKERNEL,            CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc,               cuMemAlloc_v2,             CUDA_CUMEMALLOC,                CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost,           cuMemAllocHost_v2,         CUDA_CUMEMALLOCHOST,            CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoD,             cuMemcpyDtoD_v2,           CUDA_CUMEMCPYDTOD,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoH,             cuMemcpyDtoH_v2,           CUDA_CUMEMCPYDTOH,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoD,             cuMemcpyHtoD_v2,           CUDA_CUMEMCPYHTOD,              CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32,              cuMemsetD32,               CUDA_CUMEMSETD32,               CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8,               cuMemsetD8,                CUDA_CUMEMSETD8,                CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync,        cuMemcpyDtoDAsync_v2,      CUDA_CUMEMCPYDTODASYNC,         CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync,        cuMemcpyDtoHAsync_v2,      CUDA_CUMEMCPYDTOHASYNC,         CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync,        cuMemcpyHtoDAsync_v2,      CUDA_CUMEMCPYHTODASYNC,         CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async,         cuMemsetD32Async,          CUDA_CUMEMSETD32ASYNC,          CUDA, 1);
+  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async,          cuMemsetD8Async,           CUDA_CUMEMSETD8ASYNC,           CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemFree,                cuMemFree_v2,              CUDA_CUMEMFREE,                 CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost,            cuMemFreeHost,             CUDA_CUMEMFREEHOST,             CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo,             cuMemGetInfo_v2,           CUDA_CUMEMGETINFO,              CUDA, 1);
-  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async,         cuMemsetD32Async,          CUDA_CUMEMSETD32ASYNC,          CUDA, 1);
-  HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async,          cuMemsetD8Async,           CUDA_CUMEMSETD8ASYNC,           CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuModuleGetFunction,      cuModuleGetFunction,       CUDA_CUMODULEGETFUNCTION,       CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuModuleGetGlobal,        cuModuleGetGlobal_v2,      CUDA_CUMODULEGETGLOBAL,         CUDA, 1);
   HC_LOAD_FUNC_CUDA (cuda, cuModuleLoad,             cuModuleLoad,              CUDA_CUMODULELOAD,              CUDA, 1);
@@ -517,6 +522,142 @@ int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr)
   return 0;
 }
 
+
+int hc_cuMemcpyDtoH (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount)
+{
+  // Calls cuMemcpyDtoH through the function table stored in backend_ctx->cuda.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx  = (hashcat_ctx_t *) hashcat_ctx;
+  CUDA_PTR      *cuda = (CUDA_PTR *) ctx->backend_ctx->cuda;
+
+  const CUresult rc = cuda->cuMemcpyDtoH (dstHost, srcDevice, ByteCount);
+
+  if (rc == CUDA_SUCCESS) return 0;
+
+  // Log the driver's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (cuda->cuGetErrorString (rc, &err_str) != CUDA_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_cuMemcpyDtoD (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount)
+{
+  // Calls cuMemcpyDtoD through the function table stored in backend_ctx->cuda.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx  = (hashcat_ctx_t *) hashcat_ctx;
+  CUDA_PTR      *cuda = (CUDA_PTR *) ctx->backend_ctx->cuda;
+
+  const CUresult rc = cuda->cuMemcpyDtoD (dstDevice, srcDevice, ByteCount);
+
+  if (rc == CUDA_SUCCESS) return 0;
+
+  // Log the driver's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (cuda->cuGetErrorString (rc, &err_str) != CUDA_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_cuMemcpyHtoD (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
+{
+  // Calls cuMemcpyHtoD through the function table stored in backend_ctx->cuda.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx  = (hashcat_ctx_t *) hashcat_ctx;
+  CUDA_PTR      *cuda = (CUDA_PTR *) ctx->backend_ctx->cuda;
+
+  const CUresult rc = cuda->cuMemcpyHtoD (dstDevice, srcHost, ByteCount);
+
+  if (rc == CUDA_SUCCESS) return 0;
+
+  // Log the driver's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (cuda->cuGetErrorString (rc, &err_str) != CUDA_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_cuMemsetD32 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N)
+{
+  // Calls cuMemsetD32 through the function table stored in backend_ctx->cuda.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx  = (hashcat_ctx_t *) hashcat_ctx;
+  CUDA_PTR      *cuda = (CUDA_PTR *) ctx->backend_ctx->cuda;
+
+  const CUresult rc = cuda->cuMemsetD32 (dstDevice, ui, N);
+
+  if (rc == CUDA_SUCCESS) return 0;
+
+  // Log the driver's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (cuda->cuGetErrorString (rc, &err_str) != CUDA_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "cuMemsetD32(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "cuMemsetD32(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_cuMemsetD8 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N)
+{
+  // Calls cuMemsetD8 through the function table stored in backend_ctx->cuda.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx  = (hashcat_ctx_t *) hashcat_ctx;
+  CUDA_PTR      *cuda = (CUDA_PTR *) ctx->backend_ctx->cuda;
+
+  const CUresult rc = cuda->cuMemsetD8 (dstDevice, uc, N);
+
+  if (rc == CUDA_SUCCESS) return 0;
+
+  // Log the driver's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (cuda->cuGetErrorString (rc, &err_str) != CUDA_SUCCESS)
+  {
+    event_log_error (hashcat_ctx, "cuMemsetD8(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "cuMemsetD8(): %s", err_str);
+  }
+
+  return -1;
+}
+
 int hc_cuMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
diff --git a/src/ext_hip.c b/src/ext_hip.c
index decfa987c..9e7eb00ab 100644
--- a/src/ext_hip.c
+++ b/src/ext_hip.c
@@ -115,47 +115,55 @@ int hip_init (void *hashcat_ctx)
   // a good reference is cuda.h itself
   // this needs to be verified for each new cuda release
 
-  HC_LOAD_FUNC_HIP (hip, hipCtxCreate,              hipCtxCreate,               HIP_HIPCTXCREATE,               HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxDestroy,             hipCtxDestroy,              HIP_HIPCTXDESTROY,              HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent,          hipCtxPopCurrent,           HIP_HIPCTXPOPCURRENT,           HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent,         hipCtxPushCurrent,          HIP_HIPCTXPUSHCURRENT,          HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent,          hipCtxSetCurrent,           HIP_HIPCTXSETCURRENT,           HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize,         hipCtxSynchronize,          HIP_HIPCTXSYNCHRONIZE,          HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGet,              hipDeviceGet,               HIP_HIPDEVICEGET,               HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute,     hipDeviceGetAttribute,      HIP_HIPDEVICEGETATTRIBUTE,      HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount,         hipGetDeviceCount,          HIP_HIPDEVICEGETCOUNT,          HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceGetName,          hipDeviceGetName,           HIP_HIPDEVICEGETNAME,           HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem,         hipDeviceTotalMem,          HIP_HIPDEVICETOTALMEM,          HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion,       hipDriverGetVersion,        HIP_HIPDRIVERGETVERSION,        HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventCreate,            hipEventCreateWithFlags,    HIP_HIPEVENTCREATE,             HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventDestroy,           hipEventDestroy,            HIP_HIPEVENTDESTROY,            HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime,       hipEventElapsedTime,        HIP_HIPEVENTELAPSEDTIME,        HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventRecord,            hipEventRecord,             HIP_HIPEVENTRECORD,             HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipEventSynchronize,       hipEventSynchronize,        HIP_HIPEVENTSYNCHRONIZE,        HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute,       hipFuncGetAttribute,        HIP_HIPFUNCGETATTRIBUTE,        HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetErrorName,           hipDrvGetErrorName,         HIP_HIPGETERRORNAME,            HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetErrorString,         hipDrvGetErrorString,       HIP_HIPGETERRORSTRING,          HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipInit,                   hipInit,                    HIP_HIPINIT,                    HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipLaunchKernel,           hipModuleLaunchKernel,      HIP_HIPLAUNCHKERNEL,            HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemAlloc,               hipMalloc,                  HIP_HIPMEMALLOC,                HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemFree,                hipFree,                    HIP_HIPMEMFREE,                 HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemGetInfo,             hipMemGetInfo,              HIP_HIPMEMGETINFO,              HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync,        hipMemcpyDtoDAsync,         HIP_HIPMEMCPYDTODASYNC,         HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync,        hipMemcpyDtoHAsync,         HIP_HIPMEMCPYDTOHASYNC,         HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync,        hipMemcpyHtoDAsync,         HIP_HIPMEMCPYHTODASYNC,         HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async,         hipMemsetD32Async,          HIP_HIPMEMSETD32ASYNC,          HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async,          hipMemsetD8Async,           HIP_HIPMEMSETD8ASYNC,           HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync,        hipMemcpyHtoDAsync,         HIP_HIPMEMCPYHTODASYNC,         HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction,      hipModuleGetFunction,       HIP_HIPMODULEGETFUNCTION,       HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal,        hipModuleGetGlobal,         HIP_HIPMODULEGETGLOBAL,         HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx,       hipModuleLoadDataEx,        HIP_HIPMODULELOADDATAEX,        HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleUnload,           hipModuleUnload,            HIP_HIPMODULEUNLOAD,            HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion,      hipRuntimeGetVersion,       HIP_HIPRUNTIMEGETVERSION,       HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipStreamCreate,           hipStreamCreate,            HIP_HIPSTREAMCREATE,            HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipStreamDestroy,          hipStreamDestroy,           HIP_HIPSTREAMDESTROY,           HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize,      hipStreamSynchronize,       HIP_HIPSTREAMSYNCHRONIZE,       HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties,    hipGetDevicePropertiesR0600,     HIP_HIPGETDEVICEPROPERTIES,     HIP, 1);
-  HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor,    hipModuleOccupancyMaxActiveBlocksPerMultiprocessor,     HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR,     HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxCreate,              hipCtxCreate,                 HIP_HIPCTXCREATE,               HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxDestroy,             hipCtxDestroy,                HIP_HIPCTXDESTROY,              HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent,          hipCtxPopCurrent,             HIP_HIPCTXPOPCURRENT,           HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent,         hipCtxPushCurrent,            HIP_HIPCTXPUSHCURRENT,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent,          hipCtxSetCurrent,             HIP_HIPCTXSETCURRENT,           HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize,         hipCtxSynchronize,            HIP_HIPCTXSYNCHRONIZE,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGet,              hipDeviceGet,                 HIP_HIPDEVICEGET,               HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute,     hipDeviceGetAttribute,        HIP_HIPDEVICEGETATTRIBUTE,      HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount,         hipGetDeviceCount,            HIP_HIPDEVICEGETCOUNT,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceGetName,          hipDeviceGetName,             HIP_HIPDEVICEGETNAME,           HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem,         hipDeviceTotalMem,            HIP_HIPDEVICETOTALMEM,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion,       hipDriverGetVersion,          HIP_HIPDRIVERGETVERSION,        HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventCreate,            hipEventCreate,               HIP_HIPEVENTCREATE,             HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventCreateWithFlags,   hipEventCreateWithFlags,      HIP_HIPEVENTCREATEWITHFLAGS,    HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventDestroy,           hipEventDestroy,              HIP_HIPEVENTDESTROY,            HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime,       hipEventElapsedTime,          HIP_HIPEVENTELAPSEDTIME,        HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventRecord,            hipEventRecord,               HIP_HIPEVENTRECORD,             HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipEventSynchronize,       hipEventSynchronize,          HIP_HIPEVENTSYNCHRONIZE,        HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute,       hipFuncGetAttribute,          HIP_HIPFUNCGETATTRIBUTE,        HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetErrorName,           hipDrvGetErrorName,           HIP_HIPGETERRORNAME,            HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetErrorString,         hipDrvGetErrorString,         HIP_HIPGETERRORSTRING,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipInit,                   hipInit,                      HIP_HIPINIT,                    HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipLaunchKernel,           hipModuleLaunchKernel,        HIP_HIPLAUNCHKERNEL,            HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemAlloc,               hipMalloc,                    HIP_HIPMEMALLOC,                HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemFree,                hipFree,                      HIP_HIPMEMFREE,                 HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemGetInfo,             hipMemGetInfo,                HIP_HIPMEMGETINFO,              HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD,             hipMemcpyDtoD,                HIP_HIPMEMCPYDTOD,              HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH,             hipMemcpyDtoH,                HIP_HIPMEMCPYDTOH,              HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD,             hipMemcpyHtoD,                HIP_HIPMEMCPYHTOD,              HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD32,              hipMemsetD32,                 HIP_HIPMEMSETD32,               HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD8,               hipMemsetD8,                  HIP_HIPMEMSETD8,                HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync,        hipMemcpyDtoDAsync,           HIP_HIPMEMCPYDTODASYNC,         HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync,        hipMemcpyDtoHAsync,           HIP_HIPMEMCPYDTOHASYNC,         HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync,        hipMemcpyHtoDAsync,           HIP_HIPMEMCPYHTODASYNC,         HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async,         hipMemsetD32Async,            HIP_HIPMEMSETD32ASYNC,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async,          hipMemsetD8Async,             HIP_HIPMEMSETD8ASYNC,           HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction,      hipModuleGetFunction,         HIP_HIPMODULEGETFUNCTION,       HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal,        hipModuleGetGlobal,           HIP_HIPMODULEGETGLOBAL,         HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx,       hipModuleLoadDataEx,          HIP_HIPMODULELOADDATAEX,        HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleUnload,           hipModuleUnload,              HIP_HIPMODULEUNLOAD,            HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion,      hipRuntimeGetVersion,         HIP_HIPRUNTIMEGETVERSION,       HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipSetDevice,              hipSetDevice,                 HIP_HIPSETDEVICE,               HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipSetDeviceFlags,         hipSetDeviceFlags,            HIP_HIPSETDEVICEFLAGS,          HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamCreate,           hipStreamCreate,              HIP_HIPSTREAMCREATE,            HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamCreateWithFlags,  hipStreamCreateWithFlags,     HIP_HIPSTREAMCREATEWITHFLAGS,   HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamDestroy,          hipStreamDestroy,             HIP_HIPSTREAMDESTROY,           HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize,      hipStreamSynchronize,         HIP_HIPSTREAMSYNCHRONIZE,       HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties,    hipGetDevicePropertiesR0600,  HIP_HIPGETDEVICEPROPERTIES,     HIP, 1);
+  HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR, HIP, 1);
 
   return 0;
 }
@@ -503,13 +511,13 @@ int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion)
   return 0;
 }
 
-int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags)
+int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
 
   HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
 
-  const hipError_t HIP_err = hip->hipEventCreate (phEvent, Flags);
+  const hipError_t HIP_err = hip->hipEventCreate (phEvent);
 
   if (HIP_err != hipSuccess)
   {
@@ -530,6 +538,33 @@ int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flag
   return 0;
 }
 
+int hc_hipEventCreateWithFlags (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int flags)
+{
+  // Calls hipEventCreateWithFlags through the function table in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipEventCreateWithFlags (phEvent, flags);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %s", err_str);
+  }
+
+  return -1;
+}
+
 int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
@@ -800,6 +835,143 @@ int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total)
   return 0;
 }
 
+
+
+int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount)
+{
+  // Calls hipMemcpyDtoH through the function table stored in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipMemcpyDtoH (dstHost, srcDevice, ByteCount);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount)
+{
+  // Calls hipMemcpyDtoD through the function table stored in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipMemcpyDtoD (dstDevice, srcDevice, ByteCount);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount)
+{
+  // Calls hipMemcpyHtoD through the function table stored in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipMemcpyHtoD (dstDevice, srcHost, ByteCount);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipMemcpyHtoD(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipMemcpyHtoD(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N)
+{
+  // Calls hipMemsetD32 through the function table stored in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipMemsetD32 (dstDevice, ui, N);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipMemsetD32(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipMemsetD32(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N)
+{
+  // Calls hipMemsetD8 through the function table stored in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipMemsetD8 (dstDevice, uc, N);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipMemsetD8(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipMemsetD8(): %s", err_str);
+  }
+
+  return -1;
+}
+
 int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
@@ -1070,13 +1242,67 @@ int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion)
   return 0;
 }
 
-int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags)
+int hc_hipSetDevice (void *hashcat_ctx, hipDevice_t dev)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
 
   HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
 
-  const hipError_t HIP_err = hip->hipStreamCreate (phStream, Flags);
+  const hipError_t HIP_err = hip->hipSetDevice (dev);
+
+  if (HIP_err != hipSuccess)
+  {
+    const char *pStr = NULL;
+
+    if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
+    {
+      event_log_error (hashcat_ctx, "hipSetDevice(): %s", pStr);
+    }
+    else
+    {
+      event_log_error (hashcat_ctx, "hipSetDevice(): %d", HIP_err);
+    }
+
+    return -1;
+  }
+
+  return 0;
+}
+
+int hc_hipSetDeviceFlags (void *hashcat_ctx, unsigned int flags)
+{
+  // Calls hipSetDeviceFlags through the function table stored in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipSetDeviceFlags (flags);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %s", err_str);
+  }
+
+  return -1;
+}
+
+int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream)
+{
+  backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
+
+  HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
+
+  const hipError_t HIP_err = hip->hipStreamCreate (phStream);
 
   if (HIP_err != hipSuccess)
   {
@@ -1097,6 +1323,33 @@ int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int F
   return 0;
 }
 
+int hc_hipStreamCreateWithFlags (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags)
+{
+  // Calls hipStreamCreateWithFlags through the function table in backend_ctx->hip.
+  // Returns 0 on success; on failure logs via event_log_error and returns -1.
+
+  hashcat_ctx_t *ctx = (hashcat_ctx_t *) hashcat_ctx;
+  HIP_PTR       *hip = (HIP_PTR *) ctx->backend_ctx->hip;
+
+  const hipError_t rc = hip->hipStreamCreateWithFlags (phStream, Flags);
+
+  if (rc == hipSuccess) return 0;
+
+  // Log the runtime's error string when it can be resolved, else the raw code.
+  const char *err_str = NULL;
+
+  if (hip->hipGetErrorString (rc, &err_str) != hipSuccess)
+  {
+    event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %d", rc);
+  }
+  else
+  {
+    event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %s", err_str);
+  }
+
+  return -1;
+}
+
 int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream)
 {
   backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
diff --git a/src/ext_metal.m b/src/ext_metal.m
index 6a51d48bd..a05c1a0e7 100644
--- a/src/ext_metal.m
+++ b/src/ext_metal.m
@@ -195,11 +195,14 @@ static int hc_mtlBuildOptionsToDict (void *hashcat_ctx, const char *build_option
   }
 
   // if set, add INCLUDE_PATH to hack Apple kernel build from source limitation on -I usage
+
   if (include_path != nil)
   {
     NSString *path_key = @"INCLUDE_PATH";
     NSString *path_value = [NSString stringWithCString: include_path encoding: NSUTF8StringEncoding];
+
     // Include path may contain spaces, escape them with a backslash
+
     path_value = [path_value stringByReplacingOccurrencesOfString:@" " withString:@"\\ "];
 
     [build_options_dict setObject:path_value forKey:path_key];
@@ -743,6 +746,7 @@ int hc_mtlCreateKernel (void *hashcat_ctx, mtl_device_id metal_device, mtl_libra
   dispatch_queue_t queue = dispatch_get_global_queue (DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
 
   // if no user-defined runtime, set to METAL_COMPILER_RUNTIME
+
   long timeout = (user_options->metal_compiler_runtime > 0) ? user_options->metal_compiler_runtime : METAL_COMPILER_RUNTIME;
 
   dispatch_time_t when = dispatch_time (DISPATCH_TIME_NOW,NSEC_PER_SEC * timeout);
@@ -1314,10 +1318,21 @@ int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_com
   return 0;
 }
 
-int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, size_t global_work_size, size_t local_work_size, double *ms)
+int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms)
 {
-  MTLSize numThreadgroups = {local_work_size, 1, 1};
-  MTLSize threadsGroup = {global_work_size, 1, 1};
+  MTLSize threadsPerThreadgroup =
+  {
+    local_work_size[0],
+    local_work_size[1],
+    local_work_size[2]
+  };
+
+  MTLSize threadgroupsPerGrid =
+  {
+    (global_work_size[0] + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width,
+    work_dim > 1 ? (global_work_size[1] + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height : 1,
+    work_dim > 2 ? (global_work_size[2] + threadsPerThreadgroup.depth - 1) / threadsPerThreadgroup.depth : 1
+  };
 
   if (metal_command_encoder == nil)
   {
@@ -1333,7 +1348,7 @@ int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_com
     return -1;
   }
 
-  [metal_command_encoder dispatchThreadgroups: threadsGroup threadsPerThreadgroup: numThreadgroups];
+  [metal_command_encoder dispatchThreadgroups: threadgroupsPerGrid threadsPerThreadgroup: threadsPerThreadgroup];
 
   [metal_command_encoder endEncoding];
   [metal_command_buffer commit];
@@ -1377,17 +1392,22 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_device,
 
   if (k_string != nil)
   {
-    id <MTLLibrary> r = [metal_device newLibraryWithFile: k_string error: &error];
+    NSURL *libURL = [NSURL fileURLWithPath: k_string];
 
-    if (error != nil)
+    if (libURL != nil)
     {
-      event_log_error (hashcat_ctx, "%s(): failed to create metal library from metallib, %s", __func__, [[error localizedDescription] UTF8String]);
-      return -1;
+      id <MTLLibrary> r = [metal_device newLibraryWithURL: libURL error:&error];
+
+      if (error != nil)
+      {
+        event_log_error (hashcat_ctx, "%s(): failed to create metal library from metallib, %s", __func__, [[error localizedDescription] UTF8String]);
+        return -1;
+      }
+
+      *metal_library = r;
+
+      return 0;
     }
-
-    *metal_library = r;
-
-    return 0;
   }
 
   return -1;
@@ -1420,10 +1440,17 @@ int hc_mtlCreateLibraryWithSource (void *hashcat_ctx, mtl_device_id metal_device
       }
 
       compileOptions.preprocessorMacros = build_options_dict;
+      /*
+      compileOptions.optimizationLevel = MTLLibraryOptimizationLevelSize;
+      compileOptions.mathMode = MTLMathModeSafe;
+      // compileOptions.mathMode = MTLMathModeRelaxed;
+      // compileOptions.enableLogging = true;
+      // compileOptions.fastMathEnabled = false;
+      */
     }
 
     // todo: detect current os version and choose the right
-//    compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3;
+    // compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3;
 /*
     if (@available(macOS 12.0, *))
     {
diff --git a/src/hashcat.c b/src/hashcat.c
index 5f54fd68e..98ba4b5ba 100644
--- a/src/hashcat.c
+++ b/src/hashcat.c
@@ -132,6 +132,7 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
   status_ctx->words_base = status_ctx->words_cnt / amplifier_cnt;
 
   EVENT (EVENT_CALCULATED_WORDS_BASE);
+  EVENT (EVENT_CALCULATED_WORDS_CNT);
 
   if (user_options->keyspace == true)
   {
diff --git a/src/hashes.c b/src/hashes.c
index 78a6d5b90..767c30824 100644
--- a/src/hashes.c
+++ b/src/hashes.c
@@ -334,7 +334,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla
 
     if (device_param->is_cuda == true)
     {
-      rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->cuda_stream);
+      rc = hc_cuMemcpyDtoH (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size);
 
       if (rc == 0)
       {
@@ -351,7 +351,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla
 
     if (device_param->is_hip == true)
     {
-      rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->hip_stream);
+      rc = hc_hipMemcpyDtoH (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size);
 
       if (rc == 0)
       {
@@ -382,7 +382,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla
 
     if (device_param->is_opencl == true)
     {
-      rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_FALSE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, &opencl_event);
+      rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_TRUE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, &opencl_event);
 
       if (rc == 0)
       {
@@ -587,14 +587,14 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 
   if (device_param->is_cuda == true)
   {
-    if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1;
+    if (hc_cuMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32)) == -1) return -1;
 
     if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
   }
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1;
+    if (hc_hipMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32)) == -1) return -1;
 
     if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
   }
@@ -624,7 +624,7 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 
   if (device_param->is_cuda == true)
   {
-    rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->cuda_stream);
+    rc = hc_cuMemcpyDtoH (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t));
 
     if (rc == 0)
     {
@@ -641,7 +641,7 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
 
   if (device_param->is_hip == true)
   {
-    rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->hip_stream);
+    rc = hc_hipMemcpyDtoH (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t));
 
     if (rc == 0)
     {
@@ -1133,7 +1133,7 @@ int hashes_init_stage1 (hashcat_ctx_t *hashcat_ctx)
 
     hashes_cnt = 1;
   }
-  else if (user_options->hash_info == true)
+  else if (user_options->hash_info > 0)
   {
   }
   else if (user_options->keyspace == true)
@@ -2386,13 +2386,13 @@ int hashes_init_stage5 (hashcat_ctx_t *hashcat_ctx)
 
         char *st_hash = strdup (tmp_buf);
 
-        event_log_error (hashcat_ctx, "ERROR: Incompatible self-test SCRYPT configuration detected.");
+        event_log_error (hashcat_ctx, "ERROR: Incompatible self-test configuration detected.");
 
         event_log_warning (hashcat_ctx, "The specified target hash:");
         event_log_warning (hashcat_ctx, "  -> %s", user_hash);
-        event_log_warning (hashcat_ctx, "does not match the SCRYPT configuration of the self-test hash:");
+        event_log_warning (hashcat_ctx, "does not match the configuration of the self-test hash:");
         event_log_warning (hashcat_ctx, "  -> %s", st_hash);
-        event_log_warning (hashcat_ctx, "The JIT-compiled kernel for this SCRYPT configuration may be incompatible.");
+        event_log_warning (hashcat_ctx, "The JIT-compiled kernel for this configuration may be incompatible.");
         event_log_warning (hashcat_ctx, "You must disable the self-test functionality or recompile the plugin with a matching self-test hash.");
         event_log_warning (hashcat_ctx, "To disable the self-test, use the --self-test-disable option.");
         event_log_warning (hashcat_ctx, NULL);
@@ -2414,11 +2414,11 @@ int hashes_init_stage5 (hashcat_ctx_t *hashcat_ctx)
 
         char *user_hash2 = strdup (tmp_buf);
 
-        event_log_error (hashcat_ctx, "ERROR: Mixed SCRYPT configuration detected.");
+        event_log_error (hashcat_ctx, "ERROR: Mixed configuration detected.");
 
         event_log_warning (hashcat_ctx, "The specified target hash:");
         event_log_warning (hashcat_ctx, "  -> %s", user_hash);
-        event_log_warning (hashcat_ctx, "does not match the SCRYPT configuration of another target hash:");
+        event_log_warning (hashcat_ctx, "does not match the configuration of another target hash:");
         event_log_warning (hashcat_ctx, "  -> %s", user_hash2);
         event_log_warning (hashcat_ctx, "Please run these hashes in separate cracking sessions.");
         event_log_warning (hashcat_ctx, NULL);
diff --git a/src/hwmon.c b/src/hwmon.c
index 4f5264b3d..7408056ca 100644
--- a/src/hwmon.c
+++ b/src/hwmon.c
@@ -1268,142 +1268,10 @@ u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int ba
   return 0;
 }
 
-int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
+static void hwmon_ctx_init_nvml (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_nvml, int backend_devices_cnt)
 {
-  bridge_ctx_t   *bridge_ctx   = hashcat_ctx->bridge_ctx;
-  hwmon_ctx_t    *hwmon_ctx    = hashcat_ctx->hwmon_ctx;
-  backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
-  user_options_t *user_options = hashcat_ctx->user_options;
-
-  hwmon_ctx->enabled = false;
-
-  int backend_devices_cnt = backend_ctx->backend_devices_cnt;
-
-  if (bridge_ctx->enabled == true) backend_devices_cnt = 1;
-
-  //#if !defined (WITH_HWMON)
-  //return 0;
-  //#endif // WITH_HWMON
-
-  if (user_options->usage          > 0)     return 0;
-  //if (user_options->backend_info   > 0)     return 0;
-
-  if (user_options->hash_info     == true)  return 0;
-  if (user_options->keyspace      == true)  return 0;
-  if (user_options->left          == true)  return 0;
-  if (user_options->show          == true)  return 0;
-  if (user_options->stdout_flag   == true)  return 0;
-  if (user_options->version       == true)  return 0;
-  if (user_options->identify      == true)  return 0;
-  //we need hwmon support to get free memory per device support
-  //its a joke, but there's no way around
-  //if (user_options->hwmon         == false) return 0;
-
-  hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
-
-  /**
-   * Initialize shared libraries
-   */
-
-  hm_attrs_t *hm_adapters_adl           = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
-  hm_attrs_t *hm_adapters_nvapi         = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
-  hm_attrs_t *hm_adapters_nvml          = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
-  hm_attrs_t *hm_adapters_sysfs_amdgpu  = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
-  hm_attrs_t *hm_adapters_sysfs_cpu     = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
-  hm_attrs_t *hm_adapters_iokit         = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
-
-  #define FREE_ADAPTERS                \
-  do {                                 \
-    hcfree (hm_adapters_adl);          \
-    hcfree (hm_adapters_nvapi);        \
-    hcfree (hm_adapters_nvml);         \
-    hcfree (hm_adapters_sysfs_amdgpu); \
-    hcfree (hm_adapters_sysfs_cpu);    \
-    hcfree (hm_adapters_iokit);        \
-  } while (0)
-
-  if (backend_ctx->need_nvml == true)
-  {
-    hwmon_ctx->hm_nvml = (NVML_PTR *) hcmalloc (sizeof (NVML_PTR));
-
-    if (nvml_init (hashcat_ctx) == -1)
-    {
-      hcfree (hwmon_ctx->hm_nvml);
-
-      hwmon_ctx->hm_nvml = NULL;
-    }
-  }
-
-  if ((backend_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too
-  {
-    hwmon_ctx->hm_nvapi = (NVAPI_PTR *) hcmalloc (sizeof (NVAPI_PTR));
-
-    if (nvapi_init (hashcat_ctx) == -1)
-    {
-      hcfree (hwmon_ctx->hm_nvapi);
-
-      hwmon_ctx->hm_nvapi = NULL;
-    }
-  }
-
-  if (backend_ctx->need_adl == true)
-  {
-    hwmon_ctx->hm_adl = (ADL_PTR *) hcmalloc (sizeof (ADL_PTR));
-
-    if (adl_init (hashcat_ctx) == -1)
-    {
-      hcfree (hwmon_ctx->hm_adl);
-
-      hwmon_ctx->hm_adl = NULL;
-    }
-  }
-
-  if (backend_ctx->need_sysfs_amdgpu == true)
-  {
-    hwmon_ctx->hm_sysfs_amdgpu = (SYSFS_AMDGPU_PTR *) hcmalloc (sizeof (SYSFS_AMDGPU_PTR));
-
-    if (sysfs_amdgpu_init (hashcat_ctx) == false)
-    {
-      hcfree (hwmon_ctx->hm_sysfs_amdgpu);
-
-      hwmon_ctx->hm_sysfs_amdgpu = NULL;
-    }
-
-    // also if there's ADL, we don't need sysfs_amdgpu
-
-    if (hwmon_ctx->hm_adl)
-    {
-      hcfree (hwmon_ctx->hm_sysfs_amdgpu);
-
-      hwmon_ctx->hm_sysfs_amdgpu = NULL;
-    }
-  }
-
-  if (backend_ctx->need_sysfs_cpu == true)
-  {
-    hwmon_ctx->hm_sysfs_cpu = (SYSFS_CPU_PTR *) hcmalloc (sizeof (SYSFS_CPU_PTR));
-
-    if (sysfs_cpu_init (hashcat_ctx) == false)
-    {
-      hcfree (hwmon_ctx->hm_sysfs_cpu);
-
-      hwmon_ctx->hm_sysfs_cpu = NULL;
-    }
-  }
-
-  #if defined(__APPLE__)
-  if (backend_ctx->need_iokit == true)
-  {
-    hwmon_ctx->hm_iokit = (IOKIT_PTR *) hcmalloc (sizeof (IOKIT_PTR));
-
-    if (iokit_init (hashcat_ctx) == false)
-    {
-      hcfree (hwmon_ctx->hm_iokit);
-
-      hwmon_ctx->hm_iokit = NULL;
-    }
-  }
-  #endif
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
 
   if (hwmon_ctx->hm_nvml)
   {
@@ -1485,6 +1353,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
       hcfree (nvmlGPUHandle);
     }
   }
+}
+
+static void hwmon_ctx_init_nvapi (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_nvapi, int backend_devices_cnt)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
 
   if (hwmon_ctx->hm_nvapi)
   {
@@ -1558,6 +1432,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
       hcfree (nvGPUHandle);
     }
   }
+}
+
+static int hwmon_ctx_init_adl (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_adl, int backend_devices_cnt)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
 
   if (hwmon_ctx->hm_adl)
   {
@@ -1567,23 +1447,19 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
       int tmp_in;
 
-      if (get_adapters_num_adl (hashcat_ctx, &tmp_in) == -1)
-      {
-        FREE_ADAPTERS;
-
-        return -1;
-      }
+      if (get_adapters_num_adl (hashcat_ctx, &tmp_in) == -1) return -1;
 
       // adapter info
 
       LPAdapterInfo lpAdapterInfo = (LPAdapterInfo) hccalloc (tmp_in, sizeof (AdapterInfo));
 
       if (hm_ADL_Adapter_AdapterInfo_Get (hashcat_ctx, lpAdapterInfo, tmp_in * sizeof (AdapterInfo)) == -1)
       {
-        FREE_ADAPTERS;
+        // lpAdapterInfo is local to this function: release it before bailing out
+        hcfree (lpAdapterInfo);
 
         return -1;
       }
 
       for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
       {
@@ -1639,107 +1509,260 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     }
   }
 
+  return 0;
+}
+
+static void hwmon_ctx_init_sysfs_amdgpu_iokit (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_sysfs_amdgpu, hm_attrs_t *hm_adapters_iokit, int backend_devices_cnt)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
+
   if (hwmon_ctx->hm_sysfs_amdgpu || hwmon_ctx->hm_iokit)
   {
-    if (true)
+    for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
     {
-      for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
+      hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+      if (device_param->skipped == true) continue;
+
+      if (device_param->is_cuda == true)
       {
-        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+        // nothing to do
+      }
 
-        if (device_param->skipped == true) continue;
+      #if defined (__APPLE__)
+      if (device_param->is_metal == true)
+      {
+        const u32 device_id = device_param->device_id;
 
-        if (device_param->is_cuda == true)
+        if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
         {
-          // nothing to do
+          hm_adapters_iokit[device_id].buslanes_get_supported    = false;
+          hm_adapters_iokit[device_id].corespeed_get_supported   = false;
+          hm_adapters_iokit[device_id].fanspeed_get_supported    = true;
+          hm_adapters_iokit[device_id].fanpolicy_get_supported   = false;
+          hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
+          hm_adapters_iokit[device_id].temperature_get_supported = true;
+          hm_adapters_iokit[device_id].utilization_get_supported = true;
+        }
+      }
+      #endif
+
+      if ((device_param->is_opencl == true) || (device_param->is_hip == true))
+      {
+        const u32 device_id = device_param->device_id;
+
+        if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
+        {
+          hm_adapters_iokit[device_id].buslanes_get_supported    = false;
+          hm_adapters_iokit[device_id].corespeed_get_supported   = false;
+          hm_adapters_iokit[device_id].fanspeed_get_supported    = true;
+          hm_adapters_iokit[device_id].fanpolicy_get_supported   = false;
+          hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
+          hm_adapters_iokit[device_id].temperature_get_supported = true;
+          hm_adapters_iokit[device_id].utilization_get_supported = true;
         }
 
-        #if defined (__APPLE__)
-        if (device_param->is_metal == true)
+        if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
+
+        if (hwmon_ctx->hm_sysfs_amdgpu)
         {
-          const u32 device_id = device_param->device_id;
-
-          if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
-          {
-            hm_adapters_iokit[device_id].buslanes_get_supported    = false;
-            hm_adapters_iokit[device_id].corespeed_get_supported   = false;
-            hm_adapters_iokit[device_id].fanspeed_get_supported    = true;
-            hm_adapters_iokit[device_id].fanpolicy_get_supported   = false;
-            hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
-            hm_adapters_iokit[device_id].temperature_get_supported = true;
-            hm_adapters_iokit[device_id].utilization_get_supported = true;
-          }
-        }
-        #endif
-
-        if ((device_param->is_opencl == true) || (device_param->is_hip == true))
-        {
-          const u32 device_id = device_param->device_id;
-
-          if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
-          {
-            hm_adapters_iokit[device_id].buslanes_get_supported    = false;
-            hm_adapters_iokit[device_id].corespeed_get_supported   = false;
-            hm_adapters_iokit[device_id].fanspeed_get_supported    = true;
-            hm_adapters_iokit[device_id].fanpolicy_get_supported   = false;
-            hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
-            hm_adapters_iokit[device_id].temperature_get_supported = true;
-            hm_adapters_iokit[device_id].utilization_get_supported = true;
-          }
-
-          if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
-
-          if (hwmon_ctx->hm_sysfs_amdgpu)
-          {
-            hm_adapters_sysfs_amdgpu[device_id].buslanes_get_supported    = true;
-            hm_adapters_sysfs_amdgpu[device_id].corespeed_get_supported   = true;
-            hm_adapters_sysfs_amdgpu[device_id].fanspeed_get_supported    = true;
-            hm_adapters_sysfs_amdgpu[device_id].fanpolicy_get_supported   = true;
-            hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true;
-            hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true;
-            hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true;
-            hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported  = true;
-          }
+          hm_adapters_sysfs_amdgpu[device_id].buslanes_get_supported    = true;
+          hm_adapters_sysfs_amdgpu[device_id].corespeed_get_supported   = true;
+          hm_adapters_sysfs_amdgpu[device_id].fanspeed_get_supported    = true;
+          hm_adapters_sysfs_amdgpu[device_id].fanpolicy_get_supported   = true;
+          hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true;
+          hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true;
+          hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true;
+          hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported  = true;
         }
       }
     }
   }
+}
+
+static void hwmon_ctx_init_sysfs_cpu (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_sysfs_cpu, int backend_devices_cnt)
+{
+  backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
+  hwmon_ctx_t   *hwmon_ctx   = hashcat_ctx->hwmon_ctx;
 
   if (hwmon_ctx->hm_sysfs_cpu)
   {
-    if (true)
+    for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
     {
-      for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
+      hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
+
+      if (device_param->skipped == true) continue;
+
+      if (device_param->is_cuda == true)
       {
-        hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
-
-        if (device_param->skipped == true) continue;
-
-        if (device_param->is_cuda == true)
-        {
-          // nothing to do
+        // nothing to do
-        }
+      }
 
-        if ((device_param->is_opencl == true) || (device_param->is_hip == true))
+      if ((device_param->is_opencl == true) || (device_param->is_hip == true))
+      {
+        const u32 device_id = device_param->device_id;
+
+        if ((device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) == 0) continue;
+
+        if (hwmon_ctx->hm_sysfs_cpu)
         {
-          const u32 device_id = device_param->device_id;
-
-          if ((device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) == 0) continue;
-
-          if (hwmon_ctx->hm_sysfs_cpu)
-          {
-            hm_adapters_sysfs_cpu[device_id].buslanes_get_supported    = false;
-            hm_adapters_sysfs_cpu[device_id].corespeed_get_supported   = false;
-            hm_adapters_sysfs_cpu[device_id].fanspeed_get_supported    = false;
-            hm_adapters_sysfs_cpu[device_id].fanpolicy_get_supported   = false;
-            hm_adapters_sysfs_cpu[device_id].memoryspeed_get_supported = false;
-            hm_adapters_sysfs_cpu[device_id].temperature_get_supported = true;
-            hm_adapters_sysfs_cpu[device_id].utilization_get_supported = true;
-          }
+          hm_adapters_sysfs_cpu[device_id].buslanes_get_supported    = false;
+          hm_adapters_sysfs_cpu[device_id].corespeed_get_supported   = false;
+          hm_adapters_sysfs_cpu[device_id].fanspeed_get_supported    = false;
+          hm_adapters_sysfs_cpu[device_id].fanpolicy_get_supported   = false;
+          hm_adapters_sysfs_cpu[device_id].memoryspeed_get_supported = false;
+          hm_adapters_sysfs_cpu[device_id].temperature_get_supported = true;
+          hm_adapters_sysfs_cpu[device_id].utilization_get_supported = true;
         }
       }
     }
   }
+}
+
+int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
+{
+  bridge_ctx_t   *bridge_ctx   = hashcat_ctx->bridge_ctx;
+  hwmon_ctx_t    *hwmon_ctx    = hashcat_ctx->hwmon_ctx;
+  backend_ctx_t  *backend_ctx  = hashcat_ctx->backend_ctx;
+  user_options_t *user_options = hashcat_ctx->user_options;
+
+  hwmon_ctx->enabled = false;
+
+  int backend_devices_cnt = backend_ctx->backend_devices_cnt;
+
+  if (bridge_ctx->enabled == true) backend_devices_cnt = 1;
+
+  //#if !defined (WITH_HWMON)
+  //return 0;
+  //#endif // WITH_HWMON
+
+  if (user_options->usage          > 0)     return 0;
+  if (user_options->hash_info      > 0)     return 0;
+  //if (user_options->backend_info   > 0)     return 0;
+
+  if (user_options->keyspace      == true)  return 0;
+  if (user_options->left          == true)  return 0;
+  if (user_options->show          == true)  return 0;
+  if (user_options->stdout_flag   == true)  return 0;
+  if (user_options->version       == true)  return 0;
+  if (user_options->identify      == true)  return 0;
+  //we need hwmon support to get free memory per device support
+  //its a joke, but there's no way around
+  //if (user_options->hwmon         == false) return 0;
+
+  hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
+
+  /**
+   * Initialize shared libraries
+   */
+
+  hm_attrs_t *hm_adapters_adl           = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
+  hm_attrs_t *hm_adapters_nvapi         = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
+  hm_attrs_t *hm_adapters_nvml          = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
+  hm_attrs_t *hm_adapters_sysfs_amdgpu  = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
+  hm_attrs_t *hm_adapters_sysfs_cpu     = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
+  hm_attrs_t *hm_adapters_iokit         = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
+
+  if (backend_ctx->need_nvml == true)
+  {
+    hwmon_ctx->hm_nvml = (NVML_PTR *) hcmalloc (sizeof (NVML_PTR));
+
+    if (nvml_init (hashcat_ctx) == -1)
+    {
+      hcfree (hwmon_ctx->hm_nvml);
+
+      hwmon_ctx->hm_nvml = NULL;
+    }
+  }
+
+  if ((backend_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too
+  {
+    hwmon_ctx->hm_nvapi = (NVAPI_PTR *) hcmalloc (sizeof (NVAPI_PTR));
+
+    if (nvapi_init (hashcat_ctx) == -1)
+    {
+      hcfree (hwmon_ctx->hm_nvapi);
+
+      hwmon_ctx->hm_nvapi = NULL;
+    }
+  }
+
+  if (backend_ctx->need_adl == true)
+  {
+    hwmon_ctx->hm_adl = (ADL_PTR *) hcmalloc (sizeof (ADL_PTR));
+
+    if (adl_init (hashcat_ctx) == -1)
+    {
+      hcfree (hwmon_ctx->hm_adl);
+
+      hwmon_ctx->hm_adl = NULL;
+    }
+  }
+
+  if (backend_ctx->need_sysfs_amdgpu == true)
+  {
+    hwmon_ctx->hm_sysfs_amdgpu = (SYSFS_AMDGPU_PTR *) hcmalloc (sizeof (SYSFS_AMDGPU_PTR));
+
+    if (sysfs_amdgpu_init (hashcat_ctx) == false)
+    {
+      hcfree (hwmon_ctx->hm_sysfs_amdgpu);
+
+      hwmon_ctx->hm_sysfs_amdgpu = NULL;
+    }
+  }
+
+  if (backend_ctx->need_sysfs_cpu == true)
+  {
+    hwmon_ctx->hm_sysfs_cpu = (SYSFS_CPU_PTR *) hcmalloc (sizeof (SYSFS_CPU_PTR));
+
+    if (sysfs_cpu_init (hashcat_ctx) == false)
+    {
+      hcfree (hwmon_ctx->hm_sysfs_cpu);
+
+      hwmon_ctx->hm_sysfs_cpu = NULL;
+    }
+  }
+
+  #if defined(__APPLE__)
+  if (backend_ctx->need_iokit == true)
+  {
+    hwmon_ctx->hm_iokit = (IOKIT_PTR *) hcmalloc (sizeof (IOKIT_PTR));
+
+    if (iokit_init (hashcat_ctx) == false)
+    {
+      hcfree (hwmon_ctx->hm_iokit);
+
+      hwmon_ctx->hm_iokit = NULL;
+    }
+  }
+  #endif
+
+  hwmon_ctx_init_nvml  (hashcat_ctx, hm_adapters_nvml,  backend_devices_cnt);
+
+  hwmon_ctx_init_nvapi (hashcat_ctx, hm_adapters_nvapi, backend_devices_cnt);
+
+  // if ADL init fail, disable
+
+  if (hwmon_ctx_init_adl (hashcat_ctx, hm_adapters_adl, backend_devices_cnt) == -1)
+  {
+    hcfree (hwmon_ctx->hm_adl);
+
+    hwmon_ctx->hm_adl = NULL;
+  }
+
+  // if there's ADL, we don't need sysfs_amdgpu
+
+  if (hwmon_ctx->hm_adl)
+  {
+    hcfree (hwmon_ctx->hm_sysfs_amdgpu);
+
+    hwmon_ctx->hm_sysfs_amdgpu = NULL;
+  }
+
+  hwmon_ctx_init_sysfs_amdgpu_iokit (hashcat_ctx, hm_adapters_sysfs_amdgpu, hm_adapters_iokit, backend_devices_cnt);
+
+  hwmon_ctx_init_sysfs_cpu (hashcat_ctx, hm_adapters_sysfs_cpu, backend_devices_cnt);
 
   #if defined(__APPLE__)
   if (backend_ctx->need_iokit == true)
@@ -1757,7 +1780,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (hwmon_ctx->hm_adl == NULL && hwmon_ctx->hm_nvml == NULL && hwmon_ctx->hm_sysfs_amdgpu == NULL && hwmon_ctx->hm_sysfs_cpu == NULL && hwmon_ctx->hm_iokit == NULL)
   {
-    FREE_ADAPTERS;
+    hcfree (hm_adapters_adl);
+    hcfree (hm_adapters_nvapi);
+    hcfree (hm_adapters_nvml);
+    hcfree (hm_adapters_sysfs_amdgpu);
+    hcfree (hm_adapters_sysfs_cpu);
+    hcfree (hm_adapters_iokit);
 
     return 0;
   }
@@ -1992,7 +2020,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
     hm_get_memoryused_with_devices_idx         (hashcat_ctx, backend_devices_idx);
   }
 
-  FREE_ADAPTERS;
+  hcfree (hm_adapters_adl);
+  hcfree (hm_adapters_nvapi);
+  hcfree (hm_adapters_nvml);
+  hcfree (hm_adapters_sysfs_amdgpu);
+  hcfree (hm_adapters_sysfs_cpu);
+  hcfree (hm_adapters_iokit);
 
   return 0;
 }
diff --git a/src/induct.c b/src/induct.c
index 3d9755034..476abe958 100644
--- a/src/induct.c
+++ b/src/induct.c
@@ -41,9 +41,9 @@ int induct_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage          > 0)    return 0;
   if (user_options->backend_info   > 0)    return 0;
+  if (user_options->hash_info      > 0)    return 0;
 
   if (user_options->benchmark     == true) return 0;
-  if (user_options->hash_info     == true) return 0;
   if (user_options->keyspace      == true) return 0;
   if (user_options->left          == true) return 0;
   if (user_options->show          == true) return 0;
diff --git a/src/interface.c b/src/interface.c
index 3c9c7ed2d..129c054f8 100644
--- a/src/interface.c
+++ b/src/interface.c
@@ -363,7 +363,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx)
 
     hashconfig->has_optimized_kernel = hc_path_read (source_file);
 
-    if (user_options->hash_info == false)
+    if (user_options->hash_info == 0 || user_options->hash_info > 1)
     {
       if (user_options->optimized_kernel == true)
       {
diff --git a/src/loopback.c b/src/loopback.c
index 28dcff0d0..b5799a435 100644
--- a/src/loopback.c
+++ b/src/loopback.c
@@ -62,9 +62,9 @@ int loopback_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage          > 0)    return 0;
   if (user_options->backend_info   > 0)    return 0;
+  if (user_options->hash_info      > 0)    return 0;
 
   if (user_options->benchmark     == true) return 0;
-  if (user_options->hash_info     == true) return 0;
   if (user_options->keyspace      == true) return 0;
   if (user_options->left          == true) return 0;
   if (user_options->show          == true) return 0;
diff --git a/src/main.c b/src/main.c
index deef5873c..0f32b4369 100644
--- a/src/main.c
+++ b/src/main.c
@@ -192,12 +192,13 @@ static void main_outerloop_starting (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MA
 
   status_ctx->shutdown_outer = false;
 
-  if (user_options->hash_info    == true) return;
+  if (user_options->backend_info  > 0)    return;
+  if (user_options->hash_info     > 0)    return;
+
   if (user_options->keyspace     == true) return;
   if (user_options->stdout_flag  == true) return;
   if (user_options->speed_only   == true) return;
   if (user_options->identify     == true) return;
-  if (user_options->backend_info  > 0)    return;
 
   if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == WL_MODE_MASK))
   {
@@ -269,10 +270,11 @@ static void main_cracker_finished (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYB
   const user_options_t       *user_options       = hashcat_ctx->user_options;
   const user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
 
-  if (user_options->hash_info    == true) return;
+  if (user_options->backend_info  > 0)    return;
+  if (user_options->hash_info     > 0)    return;
+
   if (user_options->keyspace     == true) return;
   if (user_options->stdout_flag  == true) return;
-  if (user_options->backend_info  > 0)    return;
 
   // if we had a prompt, clear it
 
@@ -370,10 +372,22 @@ static void main_calculated_words_base (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx,
   const user_options_t *user_options = hashcat_ctx->user_options;
 
   if (user_options->keyspace == false) return;
+  if (user_options->total_candidates == true) return;
 
   event_log_info (hashcat_ctx, "%" PRIu64 "", status_ctx->words_base);
 }
 
+static void main_calculated_words_cnt (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
+{
+  const status_ctx_t   *status_ctx   = hashcat_ctx->status_ctx;
+  const user_options_t *user_options = hashcat_ctx->user_options;
+
+  if (user_options->keyspace == false) return;
+  if (user_options->total_candidates == false) return;
+
+  event_log_info (hashcat_ctx, "%" PRIu64 "", status_ctx->words_cnt);
+}
+
 static void main_potfile_remove_parse_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
 {
   const user_options_t *user_options = hashcat_ctx->user_options;
@@ -631,7 +645,17 @@ static void main_backend_session_hostmem (MAYBE_UNUSED hashcat_ctx_t *hashcat_ct
 
   const u64 *hostmem = (const u64 *) buf;
 
-  event_log_info (hashcat_ctx, "Host memory required for this attack: %" PRIu64 " MB", *hostmem / (1024 * 1024));
+  u64 free_memory = 0;
+
+  if (get_free_memory (&free_memory) == false)
+  {
+    event_log_info (hashcat_ctx, "Host memory allocated for this attack: %" PRIu64 " MB", *hostmem / (1024 * 1024));
+  }
+  else
+  {
+    event_log_info (hashcat_ctx, "Host memory allocated for this attack: %" PRIu64 " MB (%" PRIu64 " MB free)", *hostmem / (1024 * 1024), free_memory / (1024 * 1024));
+  }
+
   event_log_info (hashcat_ctx, NULL);
 }
 
@@ -1251,6 +1275,7 @@ static void event (const u32 id, hashcat_ctx_t *hashcat_ctx, const void *buf, co
     case EVENT_BRIDGES_SALT_POST:         main_bridges_salt_post         (hashcat_ctx, buf, len); break;
     case EVENT_BRIDGES_SALT_PRE:          main_bridges_salt_pre          (hashcat_ctx, buf, len); break;
     case EVENT_CALCULATED_WORDS_BASE:     main_calculated_words_base     (hashcat_ctx, buf, len); break;
+    case EVENT_CALCULATED_WORDS_CNT:      main_calculated_words_cnt      (hashcat_ctx, buf, len); break;
     case EVENT_CRACKER_FINISHED:          main_cracker_finished          (hashcat_ctx, buf, len); break;
     case EVENT_CRACKER_HASH_CRACKED:      main_cracker_hash_cracked      (hashcat_ctx, buf, len); break;
     case EVENT_CRACKER_STARTING:          main_cracker_starting          (hashcat_ctx, buf, len); break;
@@ -1408,7 +1433,7 @@ int main (int argc, char **argv)
 
       rc_final = 0;
     }
-    else if (user_options->hash_info == true)
+    else if (user_options->hash_info > 0)
     {
       hash_info (hashcat_ctx);
 
diff --git a/src/modules/argon2_common.c b/src/modules/argon2_common.c
new file mode 100644
index 000000000..cfe47071e
--- /dev/null
+++ b/src/modules/argon2_common.c
@@ -0,0 +1,174 @@
+/**
+ * Author......: Netherlands Forensic Institute
+ * License.....: MIT
+ */
+
+#include <inttypes.h>
+#include "common.h"
+#include "types.h"
+#include "modules.h"
+#include "bitops.h"
+#include "convert.h"
+#include "shared.h"
+#include "memory.h"
+
+#define ARGON2_SYNC_POINTS  4
+#define ARGON2_BLOCK_SIZE   1024
+
+typedef struct argon2_tmp
+{
+  u32 state[4]; // just something for now
+  // within this file only sizeof (argon2_tmp_t) is used, see argon2_module_extra_tmp_size ()
+} argon2_tmp_t;
+
+typedef struct argon2_options // read through casts of hashes->esalts_buf and hashes->st_esalts_buf
+{
+  u32 type;
+  u32 version;
+  // NOTE(review): presumably the cost parameters parsed from the hash line - confirm against the decoder
+  u32 iterations;
+  u32 parallelism;         // handed to the kernel build as ARGON2_PARALLELISM, must be equal for all salts
+  u32 memory_usage_in_kib;
+
+  u32 segment_length;
+  u32 lane_length;
+  u32 memory_block_count;  // count of ARGON2_BLOCK_SIZE byte blocks, handed to the kernel build as ARGON2_TMP_ELEM
+
+  u32 digest_len;
+
+} argon2_options_t;
+
+u32 argon2_module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_min = 32; // hard-coded in kernel
+  // same value as argon2_module_kernel_threads_max (), so min == max == 32
+  return kernel_threads_min;
+}
+
+u32 argon2_module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_threads_max = 32; // hard-coded in kernel
+  // same value as argon2_module_kernel_threads_min (), so min == max == 32
+  return kernel_threads_max;
+}
+
+u64 argon2_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u64 tmp_size = 0; // we'll add some later
+  // NOTE(review): argon2_module_extra_tmp_size () reports sizeof (argon2_tmp_t) instead; how the caller combines both is not visible here
+  return tmp_size;
+}
+
+const char *argon2_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
+{
+  hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
+  // returns a hcmalloc () buffer (not freed here) holding one line: "<device_name> * <hash_mode> 1 <kernel_accel> A\n"
+  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
+  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
+
+  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
+
+  const u64 size_per_accel = (u64) ARGON2_BLOCK_SIZE * memory_block_count; // widen first, the 32 bit product wraps at 4 GiB
+
+  int   lines_sz  = 4096;
+  char *lines_buf = hcmalloc (lines_sz);
+  int   lines_pos = 0;
+
+  const u32 device_processors = device_param->device_processors;
+
+  const u32 device_maxworkgroup_size = device_param->device_maxworkgroup_size;
+
+  const u64 fixed_mem = (256 * 1024 * 1024); // some storage we need for pws[], tmps[], and others. Is around 72MiB in reality.
+
+  const u64 spill_mem = (u64) 2048 * device_processors * device_maxworkgroup_size; // 1600 according to ptxas
+  const u64 usable_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+  const u64 available_mem = (usable_mem > (fixed_mem + spill_mem)) ? (usable_mem - (fixed_mem + spill_mem)) : 0; // clamp to 0 instead of wrapping around
+
+  u32 kernel_accel_new = device_processors;
+  // an explicit user value always wins over the memory based estimate
+  if (kernel_accel_user)
+  {
+    kernel_accel_new = kernel_accel_user;
+  }
+  else
+  {
+    if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false) && (size_per_accel > 0))
+    {
+      const u64 kernel_accel_fit = available_mem / size_per_accel; // stays u64 so the clamp happens before narrowing to u32
+
+      kernel_accel_new = (kernel_accel_fit < 1) ? 1 : (u32) MIN (kernel_accel_fit, 1024); // 1024 = max supported, and never emit 0
+    }
+  }
+
+  char *new_device_name = hcstrdup (device_param->device_name);
+
+  for (char *p = new_device_name; *p != 0; p++) // single pass, no strlen () call per iteration
+  {
+    if (*p == ' ') *p = '_';
+  }
+
+  lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
+
+  hcfree (new_device_name);
+
+  return lines_buf;
+}
+
+u64 argon2_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
+{
+  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
+  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
+  // the first user esalt is the reference; the self-test esalt is the fallback when its block count is 0
+  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
+
+  const u64 size_per_accel = (u64) ARGON2_BLOCK_SIZE * memory_block_count; // widen first, the 32 bit product wraps at 4 GiB
+
+  const u64 size_argon2 = device_param->kernel_accel_max * size_per_accel; // returned size in bytes, scaled by kernel_accel_max
+
+  return size_argon2;
+}
+
+u64 argon2_module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes)
+{
+  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
+  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
+  // reference settings: the first esalt entry, or the self-test entry for a field that is 0
+  const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
+  const u32 parallelism        = (options->parallelism)        ? options->parallelism        : options_st->parallelism;
+  // entries 1 .. salts_cnt - 1 have to match the reference settings
+  for (u32 i = 1; i < hashes->salts_cnt; i++)
+  {
+    if ((memory_block_count != options[i].memory_block_count)
+     || (parallelism        != options[i].parallelism))
+    {
+      return (1ULL << 63) + i; // bit 63 set, low bits carry the mismatching index; presumably decoded as an error by the caller - confirm
+    }
+  }
+
+  // now that we know they all have the same settings, we also need to check whether the self-test hash uses different settings than the user hashes
+
+  if ((hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) == 0)
+  {
+    if ((memory_block_count != options_st->memory_block_count)
+     || (parallelism        != options_st->parallelism))
+    {
+      return (1ULL << 62); // bit 62 set marks the user vs. self-test mismatch; presumably decoded as an error by the caller - confirm
+    }
+  }
+
+  u64 tmp_size = sizeof (argon2_tmp_t);
+
+  return tmp_size;
+}
+
+char *argon2_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
+{
+  argon2_options_t *options    = (argon2_options_t *) hashes->esalts_buf;
+  argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
+
+  char *jit_build_options = NULL; // a field that is 0 in the user esalt falls back to the self-test esalt, same rule as the size helpers in this file
+  hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%u -D ARGON2_TMP_ELEM=%u", (options->parallelism) ? options->parallelism : options_st->parallelism, (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count);
+
+  return jit_build_options;
+}
+
diff --git a/src/modules/module_00020.c b/src/modules/module_00020.c
index 3d514be55..f20a23d75 100644
--- a/src/modules/module_00020.c
+++ b/src/modules/module_00020.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH_SALTED;
 static const char *HASH_NAME      = "md5($salt.$pass)";
 static const u64   KERN_TYPE      = 20;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_00021.c b/src/modules/module_00021.c
index a9072fb4e..ce19d37ab 100644
--- a/src/modules/module_00021.c
+++ b/src/modules/module_00021.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FORUM_SOFTWARE;
 static const char *HASH_NAME      = "osCommerce, xt:Commerce";
 static const u64   KERN_TYPE      = 20;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_00022.c b/src/modules/module_00022.c
index 4252f2f6d..d5742c8f0 100644
--- a/src/modules/module_00022.c
+++ b/src/modules/module_00022.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_OS;
 static const char *HASH_NAME      = "Juniper NetScreen/SSG (ScreenOS)";
 static const u64   KERN_TYPE      = 20;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_00023.c b/src/modules/module_00023.c
index b50bf7a84..d6b2f281a 100644
--- a/src/modules/module_00023.c
+++ b/src/modules/module_00023.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_IMS;
 static const char *HASH_NAME      = "Skype";
 static const u64   KERN_TYPE      = 20;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_00024.c b/src/modules/module_00024.c
index a2c4d2e90..f3da3ba75 100644
--- a/src/modules/module_00024.c
+++ b/src/modules/module_00024.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_EAS;
 static const char *HASH_NAME      = "SolarWinds Serv-U";
 static const u64   KERN_TYPE      = 20;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_01300.c b/src/modules/module_01300.c
index b50a0e99c..c79a3493c 100644
--- a/src/modules/module_01300.c
+++ b/src/modules/module_01300.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH;
 static const char *HASH_NAME      = "SHA2-224";
 static const u64   KERN_TYPE      = 1300;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_01500.c b/src/modules/module_01500.c
index b66dddff1..286eda992 100644
--- a/src/modules/module_01500.c
+++ b/src/modules/module_01500.c
@@ -20,7 +20,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4; // originally DGST_SIZE_4_2
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_OS;
 static const char *HASH_NAME      = "descrypt, DES (Unix), Traditional DES";
 static const u64   KERN_TYPE      = 1500;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_TM_KERNEL
diff --git a/src/modules/module_01800.c b/src/modules/module_01800.c
index fea4851c6..f0d2e9623 100644
--- a/src/modules/module_01800.c
+++ b/src/modules/module_01800.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_OS;
 static const char *HASH_NAME      = "sha512crypt $6$, SHA512 (Unix)";
 static const u64   KERN_TYPE      = 1800;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_USES_BITS_64;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_06400.c b/src/modules/module_06400.c
index b7ac9d87f..a553846c7 100644
--- a/src/modules/module_06400.c
+++ b/src/modules/module_06400.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_OS;
 static const char *HASH_NAME      = "AIX {ssha256}";
 static const u64   KERN_TYPE      = 6400;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_06800.c b/src/modules/module_06800.c
index 215f26992..e6cf96fd6 100644
--- a/src/modules/module_06800.c
+++ b/src/modules/module_06800.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PASSWORD_MANAGER;
 static const char *HASH_NAME      = "LastPass + LastPass sniffed";
 static const u64   KERN_TYPE      = 6800;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_07100.c b/src/modules/module_07100.c
index ec3924cec..78c8f8081 100644
--- a/src/modules/module_07100.c
+++ b/src/modules/module_07100.c
@@ -85,15 +85,6 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
   return pw_max;
 }
 
-char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
-{
-  char *jit_build_options = NULL;
-
-  hc_asprintf (&jit_build_options, "-D NO_UNROLL");
-
-  return jit_build_options;
-}
-
 int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
 {
   u64 *digest = (u64 *) digest_buf;
@@ -398,7 +389,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_hook23                   = MODULE_DEFAULT;
   module_ctx->module_hook_salt_size           = MODULE_DEFAULT;
   module_ctx->module_hook_size                = MODULE_DEFAULT;
-  module_ctx->module_jit_build_options        = module_jit_build_options;
+  module_ctx->module_jit_build_options        = MODULE_DEFAULT;
   module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
diff --git a/src/modules/module_07700.c b/src/modules/module_07700.c
index 2642a0bb7..1d783d351 100644
--- a/src/modules/module_07700.c
+++ b/src/modules/module_07700.c
@@ -21,6 +21,7 @@ static const char *HASH_NAME      = "SAP CODVN B (BCODE)";
 static const u64   KERN_TYPE      = 7700;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
                                   | OPTI_TYPE_PRECOMPUTE_INIT
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_NOT_ITERATED;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_07701.c b/src/modules/module_07701.c
index bc852639b..232d7ca4d 100644
--- a/src/modules/module_07701.c
+++ b/src/modules/module_07701.c
@@ -21,6 +21,7 @@ static const char *HASH_NAME      = "SAP CODVN B (BCODE) from RFC_READ_TABLE";
 static const u64   KERN_TYPE      = 7701;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
                                   | OPTI_TYPE_PRECOMPUTE_INIT
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_NOT_ITERATED;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_08200.c b/src/modules/module_08200.c
index 54d5c62b5..90049b1ed 100644
--- a/src/modules/module_08200.c
+++ b/src/modules/module_08200.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PASSWORD_MANAGER;
 static const char *HASH_NAME      = "1Password, cloudkeychain";
 static const u64   KERN_TYPE      = 8200;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_USES_BITS_64
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
diff --git a/src/modules/module_08300.c b/src/modules/module_08300.c
index c499667ae..80dd3475d 100644
--- a/src/modules/module_08300.c
+++ b/src/modules/module_08300.c
@@ -19,7 +19,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_5;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_NETWORK_SERVER;
 static const char *HASH_NAME      = "DNSSEC (NSEC3)";
 static const u64   KERN_TYPE      = 8300;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_BE
                                   | OPTS_TYPE_ST_HEX
diff --git a/src/modules/module_10700.c b/src/modules/module_10700.c
index 4a7725e52..b0bdd4a41 100644
--- a/src/modules/module_10700.c
+++ b/src/modules/module_10700.c
@@ -152,38 +152,47 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
 
 char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
 {
+  const u32 shared_size_scratch = (32 + 64 + 16); // LOCAL_VK u32 s_sc[FIXED_LOCAL_SIZE][PWMAXSZ4 + BLMAXSZ4 + AESSZ4];
+  const u32 shared_size_aes     = (5 * 1024);     // LOCAL_VK u32 s_te0[256];
+
   char *jit_build_options = NULL;
 
-  if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+  if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
   {
-    u32 native_threads = 0;
+    hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", 1);
+  }
+  else
+  {
+    u32 overhead = 0;
 
-    if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
+    if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
     {
-      native_threads = 1;
-    }
-    else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
-    {
-      #if defined (__APPLE__)
+      // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
+      // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
+      // on my development system. no clue where the 4 bytes are spent.
+      // I did some research on this and it seems to be related with the datatype.
+      // For example, if i used u8 instead, there's only 1 byte wasted.
 
-      native_threads = 32;
-
-      #else
-
-      if (device_param->device_local_mem_size < 49152)
+      if (device_param->is_opencl == true)
       {
-        native_threads = MIN (device_param->kernel_preferred_wgs_multiple, 32); // We can't just set 32, because Intel GPU need 8
+        overhead = 1;
       }
-      else
-      {
-        // to go over 48KiB, we need to use dynamic shared mem
-        native_threads = 49152 / 128;
-      }
-
-      #endif
     }
 
-    hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", native_threads);
+    const u32 device_local_mem_size = MIN (device_param->device_local_mem_size, 48*1024);
+
+    u32 fixed_local_size = ((device_local_mem_size - overhead) - shared_size_aes) / shared_size_scratch;
+
+    if (user_options->kernel_threads_chgd == true)
+    {
+      fixed_local_size = user_options->kernel_threads;
+    }
+    else
+    {
+      if (fixed_local_size > device_param->kernel_preferred_wgs_multiple) fixed_local_size -= fixed_local_size % device_param->kernel_preferred_wgs_multiple;
+    }
+
+    hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", fixed_local_size);
   }
 
   return jit_build_options;
diff --git a/src/modules/module_13711.c b/src/modules/module_13711.c
index ff18b6c52..9e72c4b0f 100644
--- a/src/modules/module_13711.c
+++ b/src/modules/module_13711.c
@@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // lower bound next to kernel_loops_max = 1000 below; TODO(review): document why 250
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13712.c b/src/modules/module_13712.c
index b39eecfe7..26ad43579 100644
--- a/src/modules/module_13712.c
+++ b/src/modules/module_13712.c
@@ -146,6 +146,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // lower bound next to kernel_loops_max = 1000 below; TODO(review): document why 250
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -361,7 +368,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13713.c b/src/modules/module_13713.c
index d3c8d5fa4..7c67d69e5 100644
--- a/src/modules/module_13713.c
+++ b/src/modules/module_13713.c
@@ -146,6 +146,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // lower bound next to kernel_loops_max = 1000 below; TODO(review): document why 250
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -361,7 +368,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13721.c b/src/modules/module_13721.c
index b6c937875..14a28165d 100644
--- a/src/modules/module_13721.c
+++ b/src/modules/module_13721.c
@@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // lower bound next to kernel_loops_max = 1000 below; TODO(review): document why 250
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -346,7 +353,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13722.c b/src/modules/module_13722.c
index 3c92b2091..7850433b3 100644
--- a/src/modules/module_13722.c
+++ b/src/modules/module_13722.c
@@ -148,6 +148,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // lower bound next to kernel_loops_max = 1000 below; TODO(review): document why 250
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -363,7 +370,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13723.c b/src/modules/module_13723.c
index dd8f513f7..fae3d81ec 100644
--- a/src/modules/module_13723.c
+++ b/src/modules/module_13723.c
@@ -148,6 +148,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // lower bound next to kernel_loops_max = 1000 below; TODO(review): document why 250
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -363,7 +370,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13731.c b/src/modules/module_13731.c
index 27f0e5033..a3f175fd8 100644
--- a/src/modules/module_13731.c
+++ b/src/modules/module_13731.c
@@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // lower bound next to kernel_loops_max = 1000 below; TODO(review): document why 250
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13732.c b/src/modules/module_13732.c
index e78dbaa83..4ac4cc8e9 100644
--- a/src/modules/module_13732.c
+++ b/src/modules/module_13732.c
@@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13733.c b/src/modules/module_13733.c
index 1d7c36628..29da2b0ca 100644
--- a/src/modules/module_13733.c
+++ b/src/modules/module_13733.c
@@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13751.c b/src/modules/module_13751.c
index dac31a633..d2d0a50c2 100644
--- a/src/modules/module_13751.c
+++ b/src/modules/module_13751.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 512 bit (legacy)";
 static const u64   KERN_TYPE      = 13751;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_BINARY_HASHFILE
@@ -165,6 +166,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -380,7 +388,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13752.c b/src/modules/module_13752.c
index e959a80af..bf51b8d7b 100644
--- a/src/modules/module_13752.c
+++ b/src/modules/module_13752.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1024 bit (legacy)";
 static const u64   KERN_TYPE      = 13752;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_BINARY_HASHFILE
@@ -165,6 +166,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -380,7 +388,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13753.c b/src/modules/module_13753.c
index 277afaf02..fadb4ffed 100644
--- a/src/modules/module_13753.c
+++ b/src/modules/module_13753.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1536 bit (legacy)";
 static const u64   KERN_TYPE      = 13753;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_BINARY_HASHFILE
@@ -165,6 +166,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -380,7 +388,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13761.c b/src/modules/module_13761.c
index e33a693e2..5b1b82b27 100644
--- a/src/modules/module_13761.c
+++ b/src/modules/module_13761.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 512 bit + boot-mode (legacy)";
 static const u64   KERN_TYPE      = 13751;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_BINARY_HASHFILE
diff --git a/src/modules/module_13762.c b/src/modules/module_13762.c
index 0f5f18545..6f1a27929 100644
--- a/src/modules/module_13762.c
+++ b/src/modules/module_13762.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1024 bit + boot-mode (legacy)";
 static const u64   KERN_TYPE      = 13752;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_BINARY_HASHFILE
diff --git a/src/modules/module_13763.c b/src/modules/module_13763.c
index c9e28d4c7..fb50002a2 100644
--- a/src/modules/module_13763.c
+++ b/src/modules/module_13763.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1536 bit + boot-mode (legacy)";
 static const u64   KERN_TYPE      = 13753;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_BINARY_HASHFILE
diff --git a/src/modules/module_13771.c b/src/modules/module_13771.c
index b753d171f..32951f7c2 100644
--- a/src/modules/module_13771.c
+++ b/src/modules/module_13771.c
@@ -150,6 +150,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -365,7 +372,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13772.c b/src/modules/module_13772.c
index 6494ee637..e6af92bfb 100644
--- a/src/modules/module_13772.c
+++ b/src/modules/module_13772.c
@@ -150,6 +150,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -365,7 +372,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_13773.c b/src/modules/module_13773.c
index 6f2740347..32b6ddfb1 100644
--- a/src/modules/module_13773.c
+++ b/src/modules/module_13773.c
@@ -150,6 +150,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // fixed floor, independent of all arguments; paired with kernel_loops_max = 1000 below
+
+  return kernel_loops_min;
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -365,7 +372,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_14800.c b/src/modules/module_14800.c
index 9f3c1ca0f..1e4b91a15 100644
--- a/src/modules/module_14800.c
+++ b/src/modules/module_14800.c
@@ -21,6 +21,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_ARCHIVE;
 static const char *HASH_NAME      = "iTunes backup >= 10.0";
 static const u64   KERN_TYPE      = 14800;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP2;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
diff --git a/src/modules/module_14900.c b/src/modules/module_14900.c
index cbe009024..fcc639688 100644
--- a/src/modules/module_14900.c
+++ b/src/modules/module_14900.c
@@ -19,7 +19,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_CIPHER_KPA;
 static const char *HASH_NAME      = "Skip32 (PT = $salt, key = $pass)";
 static const u64   KERN_TYPE      = 14900;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_SUGGEST_KG;
diff --git a/src/modules/module_15600.c b/src/modules/module_15600.c
index 1f2b7b9a3..66469d9bf 100644
--- a/src/modules/module_15600.c
+++ b/src/modules/module_15600.c
@@ -21,6 +21,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_CRYPTOCURRENCY_WALLET;
 static const char *HASH_NAME      = "Ethereum Wallet, PBKDF2-HMAC-SHA256";
 static const u64   KERN_TYPE      = 15600;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_16300.c b/src/modules/module_16300.c
index e7507f1fa..6a12a1b39 100644
--- a/src/modules/module_16300.c
+++ b/src/modules/module_16300.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_CRYPTOCURRENCY_WALLET;
 static const char *HASH_NAME      = "Ethereum Pre-Sale Wallet, PBKDF2-HMAC-SHA256";
 static const u64   KERN_TYPE      = 16300;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_16700.c b/src/modules/module_16700.c
index 9dce98e62..2fec2f557 100644
--- a/src/modules/module_16700.c
+++ b/src/modules/module_16700.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "FileVault 2";
 static const u64   KERN_TYPE      = 16200;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_16900.c b/src/modules/module_16900.c
index 443da6007..70eca3f16 100644
--- a/src/modules/module_16900.c
+++ b/src/modules/module_16900.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PASSWORD_MANAGER;
 static const char *HASH_NAME      = "Ansible Vault";
 static const u64   KERN_TYPE      = 16900;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_18100.c b/src/modules/module_18100.c
index 46dadfe60..678cb8265 100644
--- a/src/modules/module_18100.c
+++ b/src/modules/module_18100.c
@@ -21,6 +21,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_OTP;
 static const char *HASH_NAME      = "TOTP (HMAC-SHA1)";
 static const u64   KERN_TYPE      = 18100;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_NOT_ITERATED;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_BE
diff --git a/src/modules/module_20800.c b/src/modules/module_20800.c
index 6cece615d..1330b99bc 100644
--- a/src/modules/module_20800.c
+++ b/src/modules/module_20800.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH_SALTED;
 static const char *HASH_NAME      = "sha256(md5($pass))";
 static const u64   KERN_TYPE      = 20800;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_21300.c b/src/modules/module_21300.c
index 75e57c1f4..34ebff7f5 100644
--- a/src/modules/module_21300.c
+++ b/src/modules/module_21300.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH_SALTED;
 static const char *HASH_NAME      = "md5($salt.sha1($salt.$pass))";
 static const u64   KERN_TYPE      = 21300;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_21500.c b/src/modules/module_21500.c
index dba580a7c..8936be11c 100644
--- a/src/modules/module_21500.c
+++ b/src/modules/module_21500.c
@@ -21,6 +21,7 @@ static const char *HASH_NAME      = "SolarWinds Orion";
 static const u64   KERN_TYPE      = 21500;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
                                   | OPTI_TYPE_USES_BITS_64
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_21700.c b/src/modules/module_21700.c
index 65536027b..e00cf232f 100644
--- a/src/modules/module_21700.c
+++ b/src/modules/module_21700.c
@@ -22,6 +22,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_CRYPTOCURRENCY_WALLET;
 static const char *HASH_NAME      = "Electrum Wallet (Salt-Type 4)";
 static const u64   KERN_TYPE      = 21700;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_USES_BITS_64
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
diff --git a/src/modules/module_22100.c b/src/modules/module_22100.c
index 344218243..4d79d30bf 100644
--- a/src/modules/module_22100.c
+++ b/src/modules/module_22100.c
@@ -20,7 +20,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "BitLocker";
 static const u64   KERN_TYPE      = 22100;
-static const u32   OPTI_TYPE      = OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+static const u32   OPTI_TYPE      = OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_MP_MULTI_DISABLE;
diff --git a/src/modules/module_22300.c b/src/modules/module_22300.c
index a85d1d753..04a4bc08c 100644
--- a/src/modules/module_22300.c
+++ b/src/modules/module_22300.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_RAW_HASH_SALTED;
 static const char *HASH_NAME      = "sha256($salt.$pass.$salt)";
 static const u64   KERN_TYPE      = 22300;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_PRECOMPUTE_INIT
                                   | OPTI_TYPE_EARLY_SKIP
                                   | OPTI_TYPE_NOT_ITERATED
diff --git a/src/modules/module_22921.c b/src/modules/module_22921.c
index 163aacee9..c6471a232 100644
--- a/src/modules/module_22921.c
+++ b/src/modules/module_22921.c
@@ -19,7 +19,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_PRIVATE_KEY;
 static const char *HASH_NAME      = "RSA/DSA/EC/OpenSSH Private Keys ($6$)";
 static const u64   KERN_TYPE      = 22921;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
diff --git a/src/modules/module_22941.c b/src/modules/module_22941.c
index 4b4bf09ac..65e7d7c2b 100644
--- a/src/modules/module_22941.c
+++ b/src/modules/module_22941.c
@@ -19,7 +19,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_PRIVATE_KEY;
 static const char *HASH_NAME      = "RSA/DSA/EC/OpenSSH Private Keys ($4$)";
 static const u64   KERN_TYPE      = 22941;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
 static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
diff --git a/src/modules/module_23400.c b/src/modules/module_23400.c
index 13743815f..c2cc2a2e2 100644
--- a/src/modules/module_23400.c
+++ b/src/modules/module_23400.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PASSWORD_MANAGER;
 static const char *HASH_NAME      = "Bitwarden";
 static const u64   KERN_TYPE      = 23400;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_23600.c b/src/modules/module_23600.c
index fdb3467d1..0b8994ff7 100644
--- a/src/modules/module_23600.c
+++ b/src/modules/module_23600.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_ARCHIVE;
 static const char *HASH_NAME      = "AxCrypt 2 AES-256";
 static const u64   KERN_TYPE      = 23600;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_USES_BITS_64
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
diff --git a/src/modules/module_23800.c b/src/modules/module_23800.c
index c6c8fec61..c75a9aaaf 100644
--- a/src/modules/module_23800.c
+++ b/src/modules/module_23800.c
@@ -20,7 +20,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4; // actually only DGST_SIZE_4_
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_ARCHIVE;
 static const char *HASH_NAME      = "RAR3-p (Compressed)";
 static const u64   KERN_TYPE      = 23800;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_HOOK23
diff --git a/src/modules/module_24200.c b/src/modules/module_24200.c
index 8b6a8e5c8..20ea6ccec 100644
--- a/src/modules/module_24200.c
+++ b/src/modules/module_24200.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_DATABASE_SERVER;
 static const char *HASH_NAME      = "MongoDB ServerKey SCRAM-SHA-256";
 static const u64   KERN_TYPE      = 24200;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_24420.c b/src/modules/module_24420.c
index 39820834a..544e07ee1 100644
--- a/src/modules/module_24420.c
+++ b/src/modules/module_24420.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PRIVATE_KEY;
 static const char *HASH_NAME      = "PKCS#8 Private Keys (PBKDF2-HMAC-SHA256 + 3DES/AES)";
 static const u64   KERN_TYPE      = 24420;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_25500.c b/src/modules/module_25500.c
index afc451fb9..06a1e795d 100644
--- a/src/modules/module_25500.c
+++ b/src/modules/module_25500.c
@@ -21,6 +21,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_CRYPTOCURRENCY_WALLET;
 static const char *HASH_NAME      = "Stargazer Stellar Wallet XLM";
 static const u64   KERN_TYPE      = 25500;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_25900.c b/src/modules/module_25900.c
index 7ad951f6d..a460a3bd8 100644
--- a/src/modules/module_25900.c
+++ b/src/modules/module_25900.c
@@ -19,7 +19,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_NETWORK_SERVER;
 static const char *HASH_NAME      = "KNX IP Secure - Device Authentication Code";
 static const u64   KERN_TYPE      = 25900;
-static const u32   OPTI_TYPE      = OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+static const u32   OPTI_TYPE      = OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_DEEP_COMP_KERNEL;
diff --git a/src/modules/module_26000.c b/src/modules/module_26000.c
index 35a6dd937..8ed90d5ee 100644
--- a/src/modules/module_26000.c
+++ b/src/modules/module_26000.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PASSWORD_MANAGER;
 static const char *HASH_NAME      = "Mozilla key3.db";
 static const u64   KERN_TYPE      = 26000;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_NOT_ITERATED;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_BE;
diff --git a/src/modules/module_26100.c b/src/modules/module_26100.c
index c42958289..3414fcbdb 100644
--- a/src/modules/module_26100.c
+++ b/src/modules/module_26100.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_PASSWORD_MANAGER;
 static const char *HASH_NAME      = "Mozilla key4.db";
 static const u64   KERN_TYPE      = 26100;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_26600.c b/src/modules/module_26600.c
index 926555e94..92b5c8efc 100644
--- a/src/modules/module_26600.c
+++ b/src/modules/module_26600.c
@@ -21,6 +21,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_CRYPTOCURRENCY_WALLET;
 static const char *HASH_NAME      = "MetaMask Wallet (needs all data, checks AES-GCM tag)";
 static const u64   KERN_TYPE      = 26600;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_26700.c b/src/modules/module_26700.c
index 63854c0d2..47427ba32 100644
--- a/src/modules/module_26700.c
+++ b/src/modules/module_26700.c
@@ -21,7 +21,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_4;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_NETWORK_PROTOCOL;
 static const char *HASH_NAME      = "SNMPv3 HMAC-SHA224-128";
 static const u64   KERN_TYPE      = 26700;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_NATIVE_THREADS
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_26800.c b/src/modules/module_26800.c
index 024bfa636..44d72f7df 100644
--- a/src/modules/module_26800.c
+++ b/src/modules/module_26800.c
@@ -21,7 +21,8 @@ static const u32   DGST_SIZE      = DGST_SIZE_4_6;
 static const u32   HASH_CATEGORY  = HASH_CATEGORY_NETWORK_PROTOCOL;
 static const char *HASH_NAME      = "SNMPv3 HMAC-SHA256-192";
 static const u64   KERN_TYPE      = 26800;
-static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_NATIVE_THREADS
                                   | OPTS_TYPE_PT_GENERATE_LE;
diff --git a/src/modules/module_26900.c b/src/modules/module_26900.c
index 8d916ebf2..29e3cdc92 100644
--- a/src/modules/module_26900.c
+++ b/src/modules/module_26900.c
@@ -22,6 +22,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_NETWORK_PROTOCOL;
 static const char *HASH_NAME      = "SNMPv3 HMAC-SHA384-256";
 static const u64   KERN_TYPE      = 26900;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_USES_BITS_64;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_NATIVE_THREADS
diff --git a/src/modules/module_27300.c b/src/modules/module_27300.c
index ff20acc9d..4d607051e 100644
--- a/src/modules/module_27300.c
+++ b/src/modules/module_27300.c
@@ -22,6 +22,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_NETWORK_PROTOCOL;
 static const char *HASH_NAME      = "SNMPv3 HMAC-SHA512-384";
 static const u64   KERN_TYPE      = 27300;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_USES_BITS_64;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_NATIVE_THREADS
diff --git a/src/modules/module_27400.c b/src/modules/module_27400.c
index 7d9f0fbd4..059b50cb5 100644
--- a/src/modules/module_27400.c
+++ b/src/modules/module_27400.c
@@ -20,6 +20,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VMware VMX (PBKDF2-HMAC-SHA1 + AES-256-CBC)";
 static const u64   KERN_TYPE      = 27400;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
diff --git a/src/modules/module_27500.c b/src/modules/module_27500.c
index ebabe24e5..6b79eba3a 100644
--- a/src/modules/module_27500.c
+++ b/src/modules/module_27500.c
@@ -22,6 +22,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VirtualBox (PBKDF2-HMAC-SHA256 & AES-128-XTS)";
 static const u64   KERN_TYPE      = 27500;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP2;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
diff --git a/src/modules/module_27600.c b/src/modules/module_27600.c
index b4cdade54..3259ce05f 100644
--- a/src/modules/module_27600.c
+++ b/src/modules/module_27600.c
@@ -22,6 +22,7 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VirtualBox (PBKDF2-HMAC-SHA256 & AES-256-XTS)";
 static const u64   KERN_TYPE      = 27600;
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_REGISTER_LIMIT
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
                                   | OPTI_TYPE_SLOW_HASH_SIMD_LOOP2;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
diff --git a/src/modules/module_29411.c b/src/modules/module_29411.c
index a3e7160eb..60fa66f6f 100644
--- a/src/modules/module_29411.c
+++ b/src/modules/module_29411.c
@@ -114,6 +114,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -356,7 +363,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29412.c b/src/modules/module_29412.c
index 969e6d355..df64b2a6c 100644
--- a/src/modules/module_29412.c
+++ b/src/modules/module_29412.c
@@ -114,6 +114,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -356,7 +363,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29413.c b/src/modules/module_29413.c
index 935d16dc5..7f455767d 100644
--- a/src/modules/module_29413.c
+++ b/src/modules/module_29413.c
@@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29421.c b/src/modules/module_29421.c
index 4c0e94163..69e4ba2d5 100644
--- a/src/modules/module_29421.c
+++ b/src/modules/module_29421.c
@@ -130,6 +130,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -372,7 +379,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29422.c b/src/modules/module_29422.c
index d32d1471e..f3b4cc98c 100644
--- a/src/modules/module_29422.c
+++ b/src/modules/module_29422.c
@@ -141,6 +141,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -383,7 +390,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29423.c b/src/modules/module_29423.c
index 988bc8a42..a038dd7a4 100644
--- a/src/modules/module_29423.c
+++ b/src/modules/module_29423.c
@@ -141,6 +141,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -383,7 +390,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29431.c b/src/modules/module_29431.c
index a22c24e3d..8555b19b7 100644
--- a/src/modules/module_29431.c
+++ b/src/modules/module_29431.c
@@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29432.c b/src/modules/module_29432.c
index 563b254c0..b9c54a1ca 100644
--- a/src/modules/module_29432.c
+++ b/src/modules/module_29432.c
@@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29433.c b/src/modules/module_29433.c
index 90e2033c9..519ff8fba 100644
--- a/src/modules/module_29433.c
+++ b/src/modules/module_29433.c
@@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -373,7 +380,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29451.c b/src/modules/module_29451.c
index c8e10bdcf..94d902273 100644
--- a/src/modules/module_29451.c
+++ b/src/modules/module_29451.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 512 bit";
 static const u64   KERN_TYPE      = 13751; // old kernel used here
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_LOOP_EXTENDED
@@ -150,6 +151,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -392,7 +400,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29452.c b/src/modules/module_29452.c
index 395839f64..1c5ac7f1b 100644
--- a/src/modules/module_29452.c
+++ b/src/modules/module_29452.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1024 bit";
 static const u64   KERN_TYPE      = 13752; // old kernel used here
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_LOOP_EXTENDED
@@ -150,6 +151,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -392,7 +400,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29453.c b/src/modules/module_29453.c
index 97bf9a333..74322f135 100644
--- a/src/modules/module_29453.c
+++ b/src/modules/module_29453.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1536 bit";
 static const u64   KERN_TYPE      = 13753; // old kernel used here
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_LOOP_EXTENDED
@@ -150,6 +151,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -392,7 +400,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29461.c b/src/modules/module_29461.c
index be7dcfc2e..ff2705da7 100644
--- a/src/modules/module_29461.c
+++ b/src/modules/module_29461.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 512 bit + boot-mode";
 static const u64   KERN_TYPE      = 13751; // old kernel used here
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_LOOP_EXTENDED
diff --git a/src/modules/module_29462.c b/src/modules/module_29462.c
index 7e9a6535c..14b53d6ad 100644
--- a/src/modules/module_29462.c
+++ b/src/modules/module_29462.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1024 bit + boot-mode";
 static const u64   KERN_TYPE      = 13752; // old kernel used here
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_LOOP_EXTENDED
diff --git a/src/modules/module_29463.c b/src/modules/module_29463.c
index e98c71289..7a5c8ac92 100644
--- a/src/modules/module_29463.c
+++ b/src/modules/module_29463.c
@@ -23,7 +23,8 @@ static const u32   HASH_CATEGORY  = HASH_CATEGORY_FDE;
 static const char *HASH_NAME      = "VeraCrypt SHA256 + XTS 1536 bit + boot-mode";
 static const u64   KERN_TYPE      = 13753; // old kernel used here
 static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
-                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+                                  | OPTI_TYPE_SLOW_HASH_SIMD_LOOP
+                                  | OPTI_TYPE_REGISTER_LIMIT;
 static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
                                   | OPTS_TYPE_PT_GENERATE_LE
                                   | OPTS_TYPE_LOOP_EXTENDED
diff --git a/src/modules/module_29471.c b/src/modules/module_29471.c
index 18d50d294..a2bfe7996 100644
--- a/src/modules/module_29471.c
+++ b/src/modules/module_29471.c
@@ -143,6 +143,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -385,7 +392,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29472.c b/src/modules/module_29472.c
index a4c0c275e..9a1614135 100644
--- a/src/modules/module_29472.c
+++ b/src/modules/module_29472.c
@@ -143,6 +143,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -385,7 +392,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_29473.c b/src/modules/module_29473.c
index 744c1ac6d..b767dad92 100644
--- a/src/modules/module_29473.c
+++ b/src/modules/module_29473.c
@@ -143,6 +143,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_min = 250; // constant lower bound; none of the arguments is read (loops_max below is 1000)
+
+  return kernel_loops_min; // hooked up as module_ctx->module_kernel_loops_min in module_init ()
+}
+
 u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
   const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@@ -385,7 +392,7 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
   module_ctx->module_kernel_loops_max         = module_kernel_loops_max;
-  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = module_kernel_loops_min;
   module_ctx->module_kernel_threads_max       = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min       = MODULE_DEFAULT;
   module_ctx->module_kern_type                = module_kern_type;
diff --git a/src/modules/module_34000.c b/src/modules/module_34000.c
new file mode 100644
index 000000000..34b621133
--- /dev/null
+++ b/src/modules/module_34000.c
@@ -0,0 +1,296 @@
+/**
+ * Author......: Netherlands Forensic Institute
+ * License.....: MIT
+ */
+
+#include "common.h"
+#include "types.h"
+#include "modules.h"
+#include "bitops.h"
+#include "convert.h"
+#include "shared.h"
+#include "memory.h"
+
+static const u32   ATTACK_EXEC    = ATTACK_EXEC_OUTSIDE_KERNEL;
+static const u32   DGST_POS0      = 0;
+static const u32   DGST_POS1      = 1;
+static const u32   DGST_POS2      = 2;
+static const u32   DGST_POS3      = 3;
+static const u32   DGST_SIZE      = DGST_SIZE_8_16;
+static const u32   HASH_CATEGORY  = HASH_CATEGORY_GENERIC_KDF;
+static const char *HASH_NAME      = "Argon2";
+static const u64   KERN_TYPE      = 34000;
+static const u32   OPTI_TYPE      = OPTI_TYPE_ZERO_BYTE
+                                  | OPTI_TYPE_SLOW_HASH_DIMY_LOOP;
+static const u64   OPTS_TYPE      = OPTS_TYPE_STOCK_MODULE
+                                  | OPTS_TYPE_PT_GENERATE_LE
+                                  | OPTS_TYPE_THREAD_MULTI_DISABLE
+                                  | OPTS_TYPE_MP_MULTI_DISABLE;
+static const u32   SALT_TYPE      = SALT_TYPE_EMBEDDED;
+static const char *ST_PASS        = "hashcat";
+static const char *ST_HASH        = "$argon2id$v=19$m=65536,t=3,p=1$FBMjI4RJBhIykCgol1KEJA$2ky5GAdhT1kH4kIgPN/oERE3Taiy43vNN70a3HpiKQU";
+// Accessors: each one returns the matching file-level constant defined above and ignores its arguments.
+u32         module_attack_exec    (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ATTACK_EXEC;     }
+u32         module_dgst_pos0      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS0;       }
+u32         module_dgst_pos1      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS1;       }
+u32         module_dgst_pos2      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS2;       }
+u32         module_dgst_pos3      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS3;       }
+u32         module_dgst_size      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_SIZE;       }
+u32         module_hash_category  (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_CATEGORY;   }
+const char *module_hash_name      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return HASH_NAME;       }
+u64         module_kern_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return KERN_TYPE;       }
+u32         module_opti_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTI_TYPE;       }
+u64         module_opts_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return OPTS_TYPE;       }
+u32         module_salt_type      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return SALT_TYPE;       }
+const char *module_st_hash        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_HASH;         }
+const char *module_st_pass        (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ST_PASS;         }
+
+#include "argon2_common.c"
+
+static const char *SIGNATURE_ARGON2D  = "$argon2d$";
+static const char *SIGNATURE_ARGON2I  = "$argon2i$";
+static const char *SIGNATURE_ARGON2ID = "$argon2id$";
+
+u64 module_esalt_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  // Reports the byte size of one argon2_options_t as the esalt size; the arguments are unused.
+  const u64 options_bytes = (u64) sizeof (argon2_options_t);
+  return options_bytes;
+}
+
+int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
+{
+  u32 *digest = (u32 *) digest_buf;
+  // digest_buf receives the base64-decoded target hash; esalt_buf receives the parsed Argon2 parameters.
+  argon2_options_t *options  = (argon2_options_t *) esalt_buf;
+
+  hc_token_t token;
+
+  memset (&token, 0, sizeof (hc_token_t));
+
+  token.token_cnt  = 7;
+
+  token.signatures_cnt    = 3;
+  token.signatures_buf[0] = SIGNATURE_ARGON2D;
+  token.signatures_buf[1] = SIGNATURE_ARGON2I;
+  token.signatures_buf[2] = SIGNATURE_ARGON2ID;
+
+  token.len_min[0] = 9;
+  token.len_max[0] = 10;
+  token.sep[0]     = 0;
+  token.attr[0]    = TOKEN_ATTR_VERIFY_SIGNATURE;
+
+  // version token, fixed 4 characters (e.g. "v=19")
+  token.len[1]     = 4;
+  token.sep[1]     = '$';
+  token.attr[1]    = TOKEN_ATTR_FIXED_LENGTH;
+
+  // memory usage in KiB token (e.g. "m=65536")
+  token.len_min[2] = 3;
+  token.len_max[2] = 12;
+  token.sep[2]     = ',';
+  token.attr[2]    = TOKEN_ATTR_VERIFY_LENGTH;
+
+  // iterations token (e.g. "t=3")
+  token.len_min[3] = 3;
+  token.len_max[3] = 5;
+  token.sep[3]     = ',';
+  token.attr[3]    = TOKEN_ATTR_VERIFY_LENGTH;
+
+  // parallelism token (e.g. "p=1")
+  token.len_min[4] = 3;
+  token.len_max[4] = 5;
+  token.sep[4]     = '$';
+  token.attr[4]    = TOKEN_ATTR_VERIFY_LENGTH;
+
+  // salt, base64-encoded; bounds derived from SALT_MIN / SALT_MAX bytes
+  token.len_min[5] = ((SALT_MIN * 8) / 6) + 0;
+  token.len_max[5] = ((SALT_MAX * 8) / 6) + 3;
+  token.sep[5]     = '$';
+  token.attr[5]    = TOKEN_ATTR_VERIFY_LENGTH
+                   | TOKEN_ATTR_VERIFY_BASE64A;
+
+  // target hash, base64-encoded; bounds derived from 1 .. 128 raw bytes
+  token.len_min[6] = ((  1 * 8) / 6) + 0;
+  token.len_max[6] = ((128 * 8) / 6) + 3;
+  token.sep[6]     = '$';
+  token.attr[6]    = TOKEN_ATTR_VERIFY_LENGTH
+                   | TOKEN_ATTR_VERIFY_BASE64A;
+
+  const int rc_tokenizer = input_tokenizer ((const u8 *) line_buf, line_len, &token);
+
+  if (rc_tokenizer != PARSER_OK) return (rc_tokenizer);
+
+  // signature selects the Argon2 variant stored in options->type (0 = argon2d, 1 = argon2i, 2 = argon2id)
+
+  const int sig_len = token.len[0];
+  const u8 *sig_pos = token.buf[0];
+
+  if      (memcmp (SIGNATURE_ARGON2D,  sig_pos, sig_len) == 0) options->type = 0;
+  else if (memcmp (SIGNATURE_ARGON2I,  sig_pos, sig_len) == 0) options->type = 1;
+  else if (memcmp (SIGNATURE_ARGON2ID, sig_pos, sig_len) == 0) options->type = 2;
+  else
+    return (PARSER_SIGNATURE_UNMATCHED);
+
+  // Argon2 parameters: each token's first two characters (e.g. "v=") are skipped via + 2 and are not validated here
+  const u8 *ver_pos = token.buf[1];
+  const u8 *mem_pos = token.buf[2];
+  const u8 *it_pos  = token.buf[3];
+  const u8 *par_pos = token.buf[4];
+
+  options->version             = hc_strtoul ((const char *) ver_pos + 2, NULL, 10);
+  options->memory_usage_in_kib = hc_strtoul ((const char *) mem_pos + 2, NULL, 10);
+  options->iterations          = hc_strtoul ((const char *) it_pos  + 2, NULL, 10);
+  options->parallelism         = hc_strtoul ((const char *) par_pos + 2, NULL, 10);
+
+  if (options->version != 19 && options->version != 16) return (PARSER_HASH_VALUE);
+  if (options->memory_usage_in_kib < 1) return (PARSER_HASH_VALUE);
+  if (options->iterations < 1) return (PARSER_HASH_VALUE);
+  if (options->parallelism < 1 || options->parallelism > 32) return (PARSER_HASH_VALUE);
+
+  options->segment_length     = MAX (2, (options->memory_usage_in_kib / (ARGON2_SYNC_POINTS * options->parallelism)));
+  options->lane_length        = options->segment_length * ARGON2_SYNC_POINTS;
+  options->memory_block_count = options->lane_length * options->parallelism;
+
+  // salt: decoded from base64 into salt->salt_buf; salt_iter and salt_dimy are derived from iterations and parallelism
+  const int salt_len = token.len[5];
+  const u8 *salt_pos = token.buf[5];
+
+  salt->salt_iter = options->iterations * ARGON2_SYNC_POINTS;
+  salt->salt_dimy = options->parallelism;
+  salt->salt_len = base64_decode (base64_to_int, (const u8 *) salt_pos, salt_len, (u8 *) salt->salt_buf);
+
+  // digest / target hash. NOTE(review): up to 173 base64 chars are accepted above; confirm the digest buffer can hold the decoded bytes
+  const int digest_len = token.len[6];
+  const u8 *digest_pos = token.buf[6];
+
+  options->digest_len = base64_decode (base64_to_int, (const u8 *) digest_pos, digest_len, (u8 *) digest);
+
+  return (PARSER_OK);
+}
+
+int module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const void *digest_buf, MAYBE_UNUSED const salt_t *salt, MAYBE_UNUSED const void *esalt_buf, MAYBE_UNUSED const void *hook_salt_buf, MAYBE_UNUSED const hashinfo_t *hash_info, char *line_buf, MAYBE_UNUSED const int line_size)
+{
+  // Rebuilds the "$argon2<variant>$v=..$m=..,t=..,p=..$<salt>$<digest>" line from the decoded fields.
+  // Returns the snprintf result for the formatted line; the inputs are only read, so they stay const.
+  const u32 *digest = (const u32 *) digest_buf;
+  const argon2_options_t *options = (const argon2_options_t *) esalt_buf;
+
+  // salt: base64-encode, then overwrite any '=' padding with NUL so the string ends before it
+  char base64_salt[512] = { 0 };
+  int len1 = base64_encode (int_to_base64, (const u8 *) salt->salt_buf, salt->salt_len, (u8 *) base64_salt);
+
+  for (int i = len1 - 1; i >= 0; i--) if (base64_salt[i] == '=') base64_salt[i] = 0;
+
+  // digest: same treatment, using the byte length stored in options->digest_len
+  char base64_digest[512] = { 0 };
+  int len2 = base64_encode (int_to_base64, (const u8 *) digest, options->digest_len, (u8 *) base64_digest);
+
+  for (int i = len2 - 1; i >= 0; i--) if (base64_digest[i] == '=') base64_digest[i] = 0;
+
+  // signature is chosen by the stored variant id (0 = argon2d, 1 = argon2i, 2 = argon2id)
+  const char *signature = NULL;
+
+  switch (options->type)
+  {
+    case 0: signature = SIGNATURE_ARGON2D;  break;
+    case 1: signature = SIGNATURE_ARGON2I;  break;
+    case 2: signature = SIGNATURE_ARGON2ID; break;
+  }
+
+  // The numeric fields are filled by hc_strtoul in module_hash_decode, so print them unsigned:
+  // %d would render any value above INT_MAX as a negative number and break the round trip.
+  const int out_len = snprintf (line_buf, line_size, "%sv=%u$m=%u,t=%u,p=%u$%s$%s",
+    signature,
+    options->version,
+    options->memory_usage_in_kib,
+    options->iterations,
+    options->parallelism,
+    base64_salt,
+    base64_digest);
+
+  return out_len;
+}
+
+void module_init (module_ctx_t *module_ctx)
+{
+  module_ctx->module_context_size             = MODULE_CONTEXT_SIZE_CURRENT;
+  module_ctx->module_interface_version        = MODULE_INTERFACE_VERSION_CURRENT;
+  // Callback table: MODULE_DEFAULT entries get no handler from this module; argon2_* handlers are not defined in this file (presumably argon2_common.c - confirm).
+  module_ctx->module_attack_exec              = module_attack_exec;
+  module_ctx->module_benchmark_esalt          = MODULE_DEFAULT;
+  module_ctx->module_benchmark_hook_salt      = MODULE_DEFAULT;
+  module_ctx->module_benchmark_mask           = MODULE_DEFAULT;
+  module_ctx->module_benchmark_charset        = MODULE_DEFAULT;
+  module_ctx->module_benchmark_salt           = MODULE_DEFAULT;
+  module_ctx->module_bridge_name              = MODULE_DEFAULT;
+  module_ctx->module_bridge_type              = MODULE_DEFAULT;
+  module_ctx->module_build_plain_postprocess  = MODULE_DEFAULT;
+  module_ctx->module_deep_comp_kernel         = MODULE_DEFAULT;
+  module_ctx->module_deprecated_notice        = MODULE_DEFAULT;
+  module_ctx->module_dgst_pos0                = module_dgst_pos0;
+  module_ctx->module_dgst_pos1                = module_dgst_pos1;
+  module_ctx->module_dgst_pos2                = module_dgst_pos2;
+  module_ctx->module_dgst_pos3                = module_dgst_pos3;
+  module_ctx->module_dgst_size                = module_dgst_size;
+  module_ctx->module_dictstat_disable         = MODULE_DEFAULT;
+  module_ctx->module_esalt_size               = module_esalt_size;
+  module_ctx->module_extra_buffer_size        = argon2_module_extra_buffer_size;
+  module_ctx->module_extra_tmp_size           = argon2_module_extra_tmp_size;
+  module_ctx->module_extra_tuningdb_block     = argon2_module_extra_tuningdb_block;
+  module_ctx->module_forced_outfile_format    = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_count        = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_parse        = MODULE_DEFAULT;
+  module_ctx->module_hash_binary_save         = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_postprocess  = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_potfile      = MODULE_DEFAULT;
+  module_ctx->module_hash_decode_zero_hash    = MODULE_DEFAULT;
+  module_ctx->module_hash_decode              = module_hash_decode;
+  module_ctx->module_hash_encode_status       = MODULE_DEFAULT;
+  module_ctx->module_hash_encode_potfile      = MODULE_DEFAULT;
+  module_ctx->module_hash_encode              = module_hash_encode;
+  module_ctx->module_hash_init_selftest       = MODULE_DEFAULT;
+  module_ctx->module_hash_mode                = MODULE_DEFAULT;
+  module_ctx->module_hash_category            = module_hash_category;
+  module_ctx->module_hash_name                = module_hash_name;
+  module_ctx->module_hashes_count_min         = MODULE_DEFAULT;
+  module_ctx->module_hashes_count_max         = MODULE_DEFAULT;
+  module_ctx->module_hlfmt_disable            = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_size    = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_init    = MODULE_DEFAULT;
+  module_ctx->module_hook_extra_param_term    = MODULE_DEFAULT;
+  module_ctx->module_hook12                   = MODULE_DEFAULT;
+  module_ctx->module_hook23                   = MODULE_DEFAULT;
+  module_ctx->module_hook_salt_size           = MODULE_DEFAULT;
+  module_ctx->module_hook_size                = MODULE_DEFAULT;
+  module_ctx->module_jit_build_options        = argon2_module_jit_build_options;
+  module_ctx->module_jit_cache_disable        = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_max         = MODULE_DEFAULT;
+  module_ctx->module_kernel_accel_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_max         = MODULE_DEFAULT;
+  module_ctx->module_kernel_loops_min         = MODULE_DEFAULT;
+  module_ctx->module_kernel_threads_max       = argon2_module_kernel_threads_max;
+  module_ctx->module_kernel_threads_min       = argon2_module_kernel_threads_min;
+  module_ctx->module_kern_type                = module_kern_type;
+  module_ctx->module_kern_type_dynamic        = MODULE_DEFAULT;
+  module_ctx->module_opti_type                = module_opti_type;
+  module_ctx->module_opts_type                = module_opts_type;
+  module_ctx->module_outfile_check_disable    = MODULE_DEFAULT;
+  module_ctx->module_outfile_check_nocomp     = MODULE_DEFAULT;
+  module_ctx->module_potfile_custom_check     = MODULE_DEFAULT;
+  module_ctx->module_potfile_disable          = MODULE_DEFAULT;
+  module_ctx->module_potfile_keep_all_hashes  = MODULE_DEFAULT;
+  module_ctx->module_pwdump_column            = MODULE_DEFAULT;
+  module_ctx->module_pw_max                   = MODULE_DEFAULT;
+  module_ctx->module_pw_min                   = MODULE_DEFAULT;
+  module_ctx->module_salt_max                 = MODULE_DEFAULT;
+  module_ctx->module_salt_min                 = MODULE_DEFAULT;
+  module_ctx->module_salt_type                = module_salt_type;
+  module_ctx->module_separator                = MODULE_DEFAULT;
+  module_ctx->module_st_hash                  = module_st_hash;
+  module_ctx->module_st_pass                  = module_st_pass;
+  module_ctx->module_tmp_size                 = argon2_module_tmp_size;
+  module_ctx->module_unstable_warning         = MODULE_DEFAULT;
+  module_ctx->module_warmup_disable           = MODULE_DEFAULT;
+}
+
diff --git a/src/modules/module_70000.c b/src/modules/module_70000.c
index 01504ea09..57bd77d12 100644
--- a/src/modules/module_70000.c
+++ b/src/modules/module_70000.c
@@ -29,7 +29,7 @@ static const u64   BRIDGE_TYPE    = BRIDGE_TYPE_MATCH_TUNINGS // optional - impr
                                   | BRIDGE_TYPE_REPLACE_LOOP;
 static const char *BRIDGE_NAME    = "argon2id_reference";
 static const char *ST_PASS        = "hashcat";
-static const char *ST_HASH        = "$argon2id$v=19$m=4096,t=3,p=1$FoIjFnZlM2JSJWYXUgMFAw$eYKMzhbW8uyT1LLtKRdRcJj2CQeRrdr2pKv/Y71YbAQ";
+static const char *ST_HASH        = "$argon2id$v=19$m=65536,t=3,p=1$FBMjI4RJBhIykCgol1KEJA$2ky5GAdhT1kH4kIgPN/oERE3Taiy43vNN70a3HpiKQU";
 
 u32         module_attack_exec    (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return ATTACK_EXEC;     }
 u32         module_dgst_pos0      (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { return DGST_POS0;       }
diff --git a/src/mpsp.c b/src/mpsp.c
index 3efe345d7..0d2ae9ae0 100644
--- a/src/mpsp.c
+++ b/src/mpsp.c
@@ -1403,8 +1403,8 @@ int mask_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage         > 0)    return 0;
   if (user_options->backend_info  > 0)    return 0;
+  if (user_options->hash_info     > 0)    return 0;
 
-  if (user_options->hash_info    == true) return 0;
   if (user_options->left         == true) return 0;
   if (user_options->show         == true) return 0;
   if (user_options->version      == true) return 0;
diff --git a/src/outfile_check.c b/src/outfile_check.c
index e681b7263..afee8ddb8 100644
--- a/src/outfile_check.c
+++ b/src/outfile_check.c
@@ -343,13 +343,13 @@ int outcheck_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   outcheck_ctx->enabled = false;
 
+  if (user_options->backend_info   > 0)    return 0;
+  if (user_options->hash_info      > 0)    return 0;
   if (user_options->keyspace      == true) return 0;
   if (user_options->benchmark     == true) return 0;
-  if (user_options->hash_info     == true) return 0;
   if (user_options->speed_only    == true) return 0;
   if (user_options->progress_only == true) return 0;
   if (user_options->identify      == true) return 0;
-  if (user_options->backend_info   > 0)    return 0;
 
   if (hashconfig->outfile_check_disable == true) return 0;
   if (user_options->outfile_check_timer == 0)    return 0;
diff --git a/src/potfile.c b/src/potfile.c
index afafca2f1..9a87ed027 100644
--- a/src/potfile.c
+++ b/src/potfile.c
@@ -115,9 +115,9 @@ int potfile_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage            > 0)     return 0;
   if (user_options->backend_info     > 0)     return 0;
+  if (user_options->hash_info        > 0)     return 0;
 
   if (user_options->benchmark       == true)  return 0;
-  if (user_options->hash_info       == true)  return 0;
   if (user_options->keyspace        == true)  return 0;
   if (user_options->stdout_flag     == true)  return 0;
   if (user_options->speed_only      == true)  return 0;
diff --git a/src/restore.c b/src/restore.c
index f2a59f7c1..e5c141bfc 100644
--- a/src/restore.c
+++ b/src/restore.c
@@ -89,6 +89,14 @@ static int read_restore (hashcat_ctx_t *hashcat_ctx)
 
   char *buf = (char *) hcmalloc (HCBUFSIZ_LARGE);
 
+  if (buf == NULL)
+  {
+    event_log_error(hashcat_ctx, "hcmalloc: %s", strerror(errno));
+    hc_fclose(&fp);
+    hcfree(rd->argv);
+    return -1;
+  }
+
   for (u32 i = 0; i < rd->argc; i++)
   {
     if (hc_fgets (buf, HCBUFSIZ_LARGE - 1, &fp) == NULL)
@@ -304,9 +312,9 @@ int restore_ctx_init (hashcat_ctx_t *hashcat_ctx, int argc, char **argv)
 
   if (user_options->usage            > 0)     return 0;
   if (user_options->backend_info     > 0)     return 0;
+  if (user_options->hash_info        > 0)     return 0;
 
   if (user_options->benchmark       == true)  return 0;
-  if (user_options->hash_info       == true)  return 0;
   if (user_options->keyspace        == true)  return 0;
   if (user_options->left            == true)  return 0;
   if (user_options->show            == true)  return 0;
diff --git a/src/selftest.c b/src/selftest.c
index f1b2b2dd5..6c63a3418 100644
--- a/src/selftest.c
+++ b/src/selftest.c
@@ -12,18 +12,14 @@
 #include "thread.h"
 #include "selftest.h"
 
-static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
+static int selftest_init (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u32 *highest_pw_len)
 {
-  bridge_ctx_t         *bridge_ctx         = hashcat_ctx->bridge_ctx;
-  hashconfig_t         *hashconfig         = hashcat_ctx->hashconfig;
   hashes_t             *hashes             = hashcat_ctx->hashes;
   module_ctx_t         *module_ctx         = hashcat_ctx->module_ctx;
-  status_ctx_t         *status_ctx         = hashcat_ctx->status_ctx;
+  hashconfig_t         *hashconfig         = hashcat_ctx->hashconfig;
   user_options_t       *user_options       = hashcat_ctx->user_options;
   user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
 
-  if (hashconfig->st_hash == NULL) return 0;
-
   // init : replace hashes with selftest hash
 
   if (device_param->is_cuda == true)
@@ -82,12 +78,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     tmp.pw_len = (u32) tmp_len;
   }
 
+  cl_event opencl_event;
+
   pw_t pw;
   pw_t comb;
   bf_t bf;
 
-  u32 highest_pw_len = 0;
-
   if (user_options->slow_candidates == true)
   {
     if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
@@ -107,12 +103,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
     if (device_param->is_cuda == true)
     {
-      if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+      if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
     }
 
     if (device_param->is_hip == true)
     {
-      if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+      if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
     }
 
     #if defined (__APPLE__)
@@ -124,7 +120,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
     if (device_param->is_opencl == true)
     {
-      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1;
+      if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1;
     }
   }
   else
@@ -152,12 +148,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
         }
 
         if (device_param->is_hip == true)
         {
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
         }
 
         #if defined (__APPLE__)
@@ -169,7 +165,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         if (device_param->is_opencl == true)
         {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1;
         }
       }
       else if (user_options_extra->attack_kern == ATTACK_KERN_COMBI)
@@ -222,16 +218,16 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_combs_c, &comb, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_combs_c, &comb, 1 * sizeof (pw_t)) == -1) return -1;
 
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
         }
 
         if (device_param->is_hip == true)
         {
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_combs_c, &comb, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_combs_c, &comb, 1 * sizeof (pw_t)) == -1) return -1;
 
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
         }
 
         #if defined (__APPLE__)
@@ -245,9 +241,9 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         if (device_param->is_opencl == true)
         {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_FALSE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_combs_c, CL_TRUE, 0, 1 * sizeof (pw_t), &comb, 0, NULL, &opencl_event) == -1) return -1;
 
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1;
         }
       }
       else if (user_options_extra->attack_kern == ATTACK_KERN_BF)
@@ -273,12 +269,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
           }
 
           #if defined (__APPLE__)
@@ -290,7 +286,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           if (device_param->is_opencl == true)
           {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1;
+            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1;
           }
         }
         else
@@ -334,12 +330,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_bfs_c, &bf, 1 * sizeof (bf_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_bfs_c, &bf, 1 * sizeof (bf_t)) == -1) return -1;
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_bfs_c, &bf, 1 * sizeof (bf_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_bfs_c, &bf, 1 * sizeof (bf_t)) == -1) return -1;
           }
 
           #if defined (__APPLE__)
@@ -351,7 +347,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           if (device_param->is_opencl == true)
           {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs_c, CL_FALSE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, NULL) == -1) return -1;
+            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_bfs_c, CL_TRUE, 0, 1 * sizeof (bf_t), &bf, 0, NULL, &opencl_event) == -1) return -1;
           }
 
           memset (&pw, 0, sizeof (pw));
@@ -440,12 +436,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           if (device_param->is_cuda == true)
           {
-            if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+            if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
           }
 
           if (device_param->is_hip == true)
           {
-            if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+            if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
           }
 
           #if defined (__APPLE__)
@@ -457,10 +453,10 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
           if (device_param->is_opencl == true)
           {
-            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1;
+            if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1;
           }
 
-          highest_pw_len = pw.pw_len;
+          *highest_pw_len = pw.pw_len;
         }
       }
     }
@@ -478,12 +474,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       if (device_param->is_cuda == true)
       {
-        if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->cuda_stream) == -1) return -1;
+        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
       }
 
       if (device_param->is_hip == true)
       {
-        if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t), device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_pws_buf, &pw, 1 * sizeof (pw_t)) == -1) return -1;
       }
 
       #if defined (__APPLE__)
@@ -495,11 +491,30 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       if (device_param->is_opencl == true)
       {
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_FALSE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, NULL) == -1) return -1;
+        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_pws_buf, CL_TRUE, 0, 1 * sizeof (pw_t), &pw, 0, NULL, &opencl_event) == -1) return -1;
       }
     }
   }
 
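+  // wait for and release the OpenCL write event; NOTE(review): the visible pws_buf writes pass CL_TRUE (blocking), so "non-blocking write" no longer applies - confirm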
+
+  if (device_param->is_opencl == true)
+  {
+    if (hc_clWaitForEvents (hashcat_ctx, 1, &opencl_event) == -1) return -1;
+
+    if (hc_clReleaseEvent (hashcat_ctx, opencl_event) == -1) return -1;
+  }
+
+  return 0;
+}
+
+static int selftest_run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u32 highest_pw_len)
+{
+  bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx;
+  hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
+  hashes_t     *hashes     = hashcat_ctx->hashes;
+  module_ctx_t *module_ctx = hashcat_ctx->module_ctx;
+
   // main : run the kernel
 
   const u32 kernel_threads_sav = device_param->kernel_threads;
@@ -572,14 +587,14 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       if (device_param->is_cuda == true)
       {
-        if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks, device_param->cuda_stream) == -1) return -1;
+        if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks) == -1) return -1;
 
         if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
       }
 
       if (device_param->is_hip == true)
       {
-        if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, device_param->size_hooks, device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, device_param->size_hooks) == -1) return -1;
 
         if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
       }
@@ -601,12 +616,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       if (device_param->is_cuda == true)
       {
-        if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->cuda_stream) == -1) return -1;
+        if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1;
       }
 
       if (device_param->is_hip == true)
       {
-        if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->hip_stream) == -1) return -1;
+        if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1;
       }
 
       #if defined (__APPLE__)
@@ -618,7 +633,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
       if (device_param->is_opencl == true)
       {
-        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_FALSE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
+        if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
       }
     }
 
@@ -668,14 +683,14 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
           {
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size) == -1) return -1;
 
               if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size) == -1) return -1;
 
               if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
             }
@@ -709,14 +724,14 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
           {
             if (device_param->is_cuda == true)
             {
-              if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+              if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, hashconfig->tmp_size) == -1) return -1;
 
               if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
             }
 
             if (device_param->is_hip == true)
             {
-              if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+              if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size) == -1) return -1;
 
               if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
             }
@@ -743,14 +758,14 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks, device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->cuda_d_hooks, device_param->size_hooks) == -1) return -1;
 
           if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
         }
 
         if (device_param->is_hip == true)
         {
-          if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, device_param->size_hooks, device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->hooks_buf, device_param->hip_d_hooks, device_param->size_hooks) == -1) return -1;
 
           if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
         }
@@ -772,12 +787,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         if (device_param->is_cuda == true)
         {
-          if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->cuda_stream) == -1) return -1;
+          if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1;
         }
 
         if (device_param->is_hip == true)
         {
-          if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks, device_param->hip_stream) == -1) return -1;
+          if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_hooks, device_param->hooks_buf, device_param->size_hooks) == -1) return -1;
         }
 
         #if defined (__APPLE__)
@@ -789,7 +804,7 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
         if (device_param->is_opencl == true)
         {
-          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_FALSE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
+          if (hc_clEnqueueWriteBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_hooks, CL_TRUE, 0, device_param->size_hooks, device_param->hooks_buf, 0, NULL, NULL) == -1) return -1;
         }
       }
     }
@@ -831,14 +846,14 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
             {
               if (device_param->is_cuda == true)
               {
-                if (hc_cuMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+                if (hc_cuMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->cuda_d_tmps, hashconfig->tmp_size) == -1) return -1;
 
                 if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
               }
 
               if (device_param->is_hip == true)
               {
-                if (hc_hipMemcpyDtoHAsync (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+                if (hc_hipMemcpyDtoH (hashcat_ctx, device_param->h_tmps, device_param->hip_d_tmps, hashconfig->tmp_size) == -1) return -1;
 
                 if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
               }
@@ -872,12 +887,12 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
             {
               if (device_param->is_cuda == true)
               {
-                if (hc_cuMemcpyHtoDAsync (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->cuda_stream) == -1) return -1;
+                if (hc_cuMemcpyHtoD (hashcat_ctx, device_param->cuda_d_tmps, device_param->h_tmps, hashconfig->tmp_size) == -1) return -1;
               }
 
               if (device_param->is_hip == true)
               {
-                if (hc_hipMemcpyHtoDAsync (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size, device_param->hip_stream) == -1) return -1;
+                if (hc_hipMemcpyHtoD (hashcat_ctx, device_param->hip_d_tmps, device_param->h_tmps, hashconfig->tmp_size) == -1) return -1;
               }
 
               #if defined (__APPLE__)
@@ -933,22 +948,28 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
 
   device_param->kernel_threads = kernel_threads_sav;
 
-  // check : check if cracked
+  return 0;
+}
 
-  u32 num_cracked = 0;
+static int selftest_cleanup (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, u32 *num_cracked)
+{
+  user_options_t       *user_options       = hashcat_ctx->user_options;
+  user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
+
+  // check : check if cracked
 
   cl_event opencl_event;
 
   if (device_param->is_cuda == true)
   {
-    if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1;
+    if (hc_cuMemcpyDtoH (hashcat_ctx, num_cracked, device_param->cuda_d_result, sizeof (u32)) == -1) return -1;
 
     if (hc_cuEventRecord (hashcat_ctx, device_param->cuda_event3, device_param->cuda_stream) == -1) return -1;
   }
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1;
+    if (hc_hipMemcpyDtoH (hashcat_ctx, num_cracked, device_param->hip_d_result, sizeof (u32)) == -1) return -1;
 
     if (hc_hipEventRecord (hashcat_ctx, device_param->hip_event3, device_param->hip_stream) == -1) return -1;
   }
@@ -956,13 +977,13 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
   #if defined (__APPLE__)
   if (device_param->is_metal == true)
   {
-    if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, &num_cracked, device_param->metal_d_result, 0, sizeof (u32)) == -1) return -1;
+    if (hc_mtlMemcpyDtoH (hashcat_ctx, device_param->metal_command_queue, num_cracked, device_param->metal_d_result, 0, sizeof (u32)) == -1) return -1;
   }
   #endif
 
   if (device_param->is_opencl == true)
   {
-    if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_FALSE, 0, sizeof (u32), &num_cracked, 0, NULL, &opencl_event) == -1) return -1;
+    if (hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_result, CL_TRUE, 0, sizeof (u32), num_cracked, 0, NULL, &opencl_event) == -1) return -1;
 
     if (hc_clFlush (hashcat_ctx, device_param->opencl_command_queue) == -1) return -1;
   }
@@ -1154,7 +1175,27 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
     if (hc_clReleaseEvent (hashcat_ctx, opencl_event) == -1) return -1;
   }
 
+  return 0;
+}
+
+static int process_selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
+{
+  hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
+  status_ctx_t *status_ctx = hashcat_ctx->status_ctx;
+
+  if (hashconfig->st_hash == NULL) return 0;
+
+  u32 highest_pw_len = 0;
+  u32 num_cracked = 0;
+
+  if (selftest_init (hashcat_ctx, device_param, &highest_pw_len) == -1) return -1;
+
+  if (selftest_run_kernel (hashcat_ctx, device_param, highest_pw_len) == -1) return -1;
+
+  if (selftest_cleanup (hashcat_ctx, device_param, &num_cracked) == -1) return -1;
+
   // check return
+
   if (num_cracked == 0)
   {
     hc_thread_mutex_lock (status_ctx->mux_display);
@@ -1169,20 +1210,22 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
       event_log_error (hashcat_ctx, "* Device #%u: ATTENTION! HIP kernel self-test failed.", device_param->device_id + 1);
     }
 
-    #if defined (__APPLE__)
     if (device_param->is_metal == true)
     {
       event_log_error (hashcat_ctx, "* Device #%u: ATTENTION! Metal kernel self-test failed.", device_param->device_id + 1);
     }
-    #endif
 
     if (device_param->is_opencl == true)
     {
       event_log_error (hashcat_ctx, "* Device #%u: ATTENTION! OpenCL kernel self-test failed.", device_param->device_id + 1);
     }
 
-    event_log_warning (hashcat_ctx, "Your device driver installation is probably broken.");
-    event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+    if (device_param->is_metal == false)
+    {
+      event_log_warning (hashcat_ctx, "Your device driver installation is probably broken.");
+      event_log_warning (hashcat_ctx, "See also: https://hashcat.net/faq/wrongdriver");
+    }
+
     event_log_warning (hashcat_ctx, NULL);
 
     hc_thread_mutex_unlock (status_ctx->mux_display);
@@ -1229,10 +1272,10 @@ HC_API_CALL void *thread_selftest (void *p)
 
   if (device_param->is_hip == true)
   {
-    if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
+    if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
   }
 
-  const int rc_selftest = selftest (hashcat_ctx, device_param);
+  const int rc_selftest = process_selftest (hashcat_ctx, device_param);
 
   if (user_options->benchmark == true)
   {
@@ -1260,8 +1303,6 @@ HC_API_CALL void *thread_selftest (void *p)
   if (device_param->is_hip == true)
   {
     if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return NULL;
-
-    if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
   }
 
   if (bridge_ctx->enabled == true)
diff --git a/src/shared.c b/src/shared.c
index a00025352..0ff206815 100644
--- a/src/shared.c
+++ b/src/shared.c
@@ -104,6 +104,9 @@ static const char *const OPTI_STR_USES_BITS_8          = "Uses-8-Bit";
 static const char *const OPTI_STR_USES_BITS_16         = "Uses-16-Bit";
 static const char *const OPTI_STR_USES_BITS_32         = "Uses-32-Bit";
 static const char *const OPTI_STR_USES_BITS_64         = "Uses-64-Bit";
+static const char *const OPTI_STR_SLOW_HASH_DIMY_INIT  = "Slow-Hash-DimensionY-INIT";
+static const char *const OPTI_STR_SLOW_HASH_DIMY_LOOP  = "Slow-Hash-DimensionY-LOOP";
+static const char *const OPTI_STR_SLOW_HASH_DIMY_COMP  = "Slow-Hash-DimensionY-COMP";
 
 static const char *const HASH_CATEGORY_UNDEFINED_STR              = "Undefined";
 static const char *const HASH_CATEGORY_RAW_HASH_STR               = "Raw Hash";
@@ -1072,6 +1075,9 @@ const char *stroptitype (const u32 opti_type)
     case OPTI_TYPE_SLOW_HASH_SIMD_LOOP:  return OPTI_STR_SLOW_HASH_SIMD_LOOP;
     case OPTI_TYPE_SLOW_HASH_SIMD_LOOP2: return OPTI_STR_SLOW_HASH_SIMD_LOOP2;
     case OPTI_TYPE_SLOW_HASH_SIMD_COMP:  return OPTI_STR_SLOW_HASH_SIMD_COMP;
+    case OPTI_TYPE_SLOW_HASH_DIMY_INIT:  return OPTI_STR_SLOW_HASH_DIMY_INIT;
+    case OPTI_TYPE_SLOW_HASH_DIMY_LOOP:  return OPTI_STR_SLOW_HASH_DIMY_LOOP;
+    case OPTI_TYPE_SLOW_HASH_DIMY_COMP:  return OPTI_STR_SLOW_HASH_DIMY_COMP;
     case OPTI_TYPE_USES_BITS_8:          return OPTI_STR_USES_BITS_8;
     case OPTI_TYPE_USES_BITS_16:         return OPTI_STR_USES_BITS_16;
     case OPTI_TYPE_USES_BITS_32:         return OPTI_STR_USES_BITS_32;
@@ -1246,15 +1252,52 @@ int input_tokenizer (const u8 *input_buf, const int input_len, hc_token_t *token
     {
       const int len = token->len[token_idx];
 
-      token->buf[token_idx + 1] = token->buf[token_idx] + len;
-
-      len_left -= len;
-
-      if (token->sep[token_idx] != 0)
+      if (len)
       {
-        token->buf[token_idx + 1]++; // +1 = separator
+        token->buf[token_idx + 1] = token->buf[token_idx] + len;
 
-        len_left--; // -1 = separator
+        len_left -= len;
+
+        if (token->sep[token_idx] != 0)
+        {
+          token->buf[token_idx + 1]++; // +1 = separator
+
+          len_left--; // -1 = separator
+        }
+      }
+
+      const int len_min = token->len_min[token_idx];
+      const int len_max = token->len_max[token_idx];
+
+      if (len_max)
+      {
+        bool matched = false;
+
+        if (token->attr[token_idx] & TOKEN_ATTR_VERIFY_SIGNATURE)
+        {
+          for (int signature_idx = 0; signature_idx < token->signatures_cnt; signature_idx++)
+          {
+            const int len_sig = strlen (token->signatures_buf[signature_idx]);
+
+            if (len_sig > len_left) continue;
+
+            if ((len_sig >= len_min) && (len_sig <= len_max))
+            {
+              if (memcmp (token->buf[token_idx], token->signatures_buf[signature_idx], len_sig) == 0)
+              {
+                token->len[token_idx] = len_sig;
+
+                token->buf[token_idx + 1] = token->buf[token_idx] + len_sig;
+
+                len_left -= len_sig;
+
+                matched = true;
+              }
+            }
+          }
+
+          if (matched == false) return (PARSER_SIGNATURE_UNMATCHED);
+        }
       }
     }
   }
diff --git a/src/straight.c b/src/straight.c
index 36d3a8eb8..58212be52 100644
--- a/src/straight.c
+++ b/src/straight.c
@@ -264,11 +264,11 @@ int straight_ctx_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage         > 0)    return 0;
   if (user_options->backend_info  > 0)    return 0;
+  if (user_options->hash_info     > 0)    return 0;
 
   if (user_options->left         == true) return 0;
   if (user_options->show         == true) return 0;
   if (user_options->version      == true) return 0;
-  if (user_options->hash_info    == true) return 0;
 
   if (user_options->attack_mode  == ATTACK_MODE_BF) return 0;
 
diff --git a/src/terminal.c b/src/terminal.c
index b76c64048..258d02f2e 100644
--- a/src/terminal.c
+++ b/src/terminal.c
@@ -28,12 +28,13 @@ void welcome_screen (hashcat_ctx_t *hashcat_ctx, const char *version_tag)
 {
   const user_options_t *user_options = hashcat_ctx->user_options;
 
-  if (user_options->quiet       == true) return;
-  if (user_options->keyspace    == true) return;
-  if (user_options->stdout_flag == true) return;
-  if (user_options->show        == true) return;
-  if (user_options->left        == true) return;
-  if (user_options->identify    == true) return;
+  if (user_options->quiet       == true)      return;
+  if (user_options->keyspace    == true)      return;
+  if (user_options->total_candidates == true) return;
+  if (user_options->stdout_flag == true)      return;
+  if (user_options->show        == true)      return;
+  if (user_options->left        == true)      return;
+  if (user_options->identify    == true)      return;
 
   if (user_options->usage > 0)
   {
@@ -88,15 +89,18 @@ void welcome_screen (hashcat_ctx_t *hashcat_ctx, const char *version_tag)
   }
   else if (user_options->backend_info > 0)
   {
-    event_log_info (hashcat_ctx, "%s (%s) starting in backend information mode", PROGNAME, version_tag);
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "%s (%s) starting in backend information mode", PROGNAME, version_tag);
+      event_log_info (hashcat_ctx, NULL);
+    }
   }
   else if (user_options->hash_mode_chgd == false)
   {
     event_log_info (hashcat_ctx, "%s (%s) starting in autodetect mode", PROGNAME, version_tag);
     event_log_info (hashcat_ctx, NULL);
   }
-  else if (user_options->hash_info == true)
+  else if (user_options->hash_info > 0)
   {
     event_log_info (hashcat_ctx, "%s (%s) starting in hash-info mode", PROGNAME, version_tag);
     event_log_info (hashcat_ctx, NULL);
@@ -648,7 +652,7 @@ void compress_terminal_line_length (char *out_buf, const size_t keep_from_beginn
   *ptr1 = 0;
 }
 
-void json_encode (char *text, char *escaped)
+void json_encode (const char *text, char *escaped)
 {
   /*
    * Based on https://www.freeformatter.com/json-escape.html, below these 7 different chars
@@ -681,6 +685,8 @@ void json_encode (char *text, char *escaped)
 
 void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_options_extra)
 {
+  const user_options_t *user_options = hashcat_ctx->user_options;
+
   if (hashconfig_init (hashcat_ctx) == 0)
   {
     hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
@@ -691,25 +697,70 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us
     printf ("\"category\": \"%s\", ", strhashcategory (hashconfig->hash_category));
     printf ("\"slow_hash\": %s, ", (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) ? "false" : "true");
 
-    printf ("\"password_len_min\": %u, ", hashconfig->pw_min);
-    printf ("\"password_len_max\": %u, ", hashconfig->pw_max);
-
     printf ("\"is_deprecated\": %s, ", (module_ctx->module_deprecated_notice != MODULE_DEFAULT) ? "true" : "false");
 
-    if (module_ctx->module_deprecated_notice != MODULE_DEFAULT) {
-      const char *deprecated_notice = module_ctx->module_deprecated_notice (hashconfig, hashcat_ctx->user_options, user_options_extra);
-      printf ("\"deprecated_notice\": \"%s\", ", deprecated_notice);
+    if (module_ctx->module_deprecated_notice != MODULE_DEFAULT)
+    {
+      const char *t_deprecated_notice = module_ctx->module_deprecated_notice (hashconfig, hashcat_ctx->user_options, user_options_extra);
+
+      char *t_deprecated_notice_json_encoded = (char *) hcmalloc (strlen (t_deprecated_notice) * 2);
+
+      json_encode (t_deprecated_notice, t_deprecated_notice_json_encoded);
+
+      printf ("\"deprecated_notice\": \"%s\", ", t_deprecated_notice_json_encoded);
+
+      hcfree (t_deprecated_notice_json_encoded);
     }
+    else
+    {
+      printf ("\"deprecated_notice\": \"%s\", ", "N/A");
+    }
+
+    const char *t_pw_desc = (hashconfig->opts_type & OPTS_TYPE_PT_HEX) ? "HEX" : "plain";
+
+    u32 t_pw_min = hashconfig->pw_min;
+    u32 t_pw_max = hashconfig->pw_max;
+
+    if (user_options->hash_info > 1)
+    {
+      if (hashconfig->opts_type & OPTS_TYPE_PT_HEX)
+      {
+        t_pw_min *= 2;
+        t_pw_max *= 2;
+      }
+    }
+
+    printf ("\"password_type\": \"%s\", ", t_pw_desc);
+    printf ("\"password_len_min\": %u, ", t_pw_min);
+    printf ("\"password_len_max\": %u, ", t_pw_max);
 
     printf ("\"is_salted\": %s, ", (hashconfig->is_salted == true) ? "true" : "false");
 
     if (hashconfig->is_salted == true)
     {
       u32 t = hashconfig->salt_type;
-      const char *t_desc = (t == SALT_TYPE_EMBEDDED) ? "embedded" : (t == SALT_TYPE_GENERIC) ? "generic" : "virtual";
-      printf ("\"salt_type\": \"%s\", ", t_desc);
-      printf ("\"salt_len_min\": %u, ", hashconfig->salt_min);
-      printf ("\"salt_len_max\": %u, ", hashconfig->salt_max);
+
+      const char *t_salt_desc = (t == SALT_TYPE_EMBEDDED) ? "embedded" : (t == SALT_TYPE_GENERIC) ? "generic" : "virtual";
+
+      printf ("\"salt_type\": \"%s\", ", t_salt_desc);
+
+      if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED)
+      {
+        u32 t_salt_min = hashconfig->salt_min;
+        u32 t_salt_max = hashconfig->salt_max;
+
+        if (user_options->hash_info > 1)
+        {
+          if (hashconfig->opts_type & OPTS_TYPE_ST_HEX)
+          {
+            t_salt_min *= 2;
+            t_salt_max *= 2;
+          }
+        }
+
+        printf ("\"salt_len_min\": %u, ", t_salt_min);
+        printf ("\"salt_len_max\": %u, ", t_salt_max);
+      }
     }
 
     if ((hashconfig->has_pure_kernel) && (hashconfig->has_optimized_kernel))
@@ -725,6 +776,39 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us
       printf ("\"kernel_type\": %s, ", "[ \"optimized\" ]");
     }
 
+    if (user_options->hash_info > 1)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        printf ("\"kernel_type_filter\": %s, ", "[ \"optimized\" ]");
+      }
+      else
+      {
+        printf ("\"kernel_type_filter\": %s, ", "[ \"pure\" ]");
+      }
+
+      printf ("\"attack_mode_filter\": %d, ", user_options->attack_mode);
+
+      // almost always 1 and -1
+      printf ("\"hashes_count_min\": %d, ", hashconfig->hashes_count_min);
+      printf ("\"hashes_count_max\": %d, ", hashconfig->hashes_count_max);
+
+      if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED)
+      {
+        bool multi_hash_same_salt = true;
+
+        if ((hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL) == 0)
+        {
+          if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+          {
+            multi_hash_same_salt = false;
+          }
+        }
+
+        printf ("\"hashes_with_same_salt\": %s, ", (multi_hash_same_salt == true) ? "true" : "false");
+      }
+    }
+
     if ((hashconfig->st_hash != NULL) && (hashconfig->st_pass != NULL))
     {
       if (hashconfig->opts_type & OPTS_TYPE_BINARY_HASHFILE)
@@ -745,7 +829,7 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us
 
       char *example_hash_json_encoded = (char *) hcmalloc (strlen (hashconfig->st_hash) * 2);
 
-      json_encode ((char *)hashconfig->st_hash, example_hash_json_encoded);
+      json_encode (hashconfig->st_hash, example_hash_json_encoded);
 
       printf ("\"example_hash\": \"%s\", ", example_hash_json_encoded);
 
@@ -821,6 +905,7 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us
     printf ("\"autodetect_enabled\": %s, ", (hashconfig->opts_type & OPTS_TYPE_AUTODETECT_DISABLE) ? "false" : "true");
     printf ("\"self_test_enabled\": %s, ", (hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) ? "false" : "true");
     printf ("\"potfile_enabled\": %s, ", (hashconfig->opts_type & OPTS_TYPE_POTFILE_NOPASS) ? "false" : "true");
+    printf ("\"keep_guessing\": %s, ", (hashconfig->opts_type & OPTS_TYPE_SUGGEST_KG) ? "true" : "false");
     printf ("\"custom_plugin\": %s, ", (hashconfig->opts_type & OPTS_TYPE_STOCK_MODULE) ? "false" : "true");
 
     if (hashconfig->opts_type & OPTS_TYPE_PT_ALWAYS_ASCII)
@@ -835,8 +920,6 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us
     {
       printf ("\"plaintext_encoding\": %s", "[ \"ASCII\", \"HEX\" ]");
     }
-
-    event_log_info (hashcat_ctx, NULL);
   }
 
   printf (" }");
@@ -846,30 +929,73 @@ void hash_info_single_json (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *us
 
 void hash_info_single (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_options_extra)
 {
+  const user_options_t *user_options = hashcat_ctx->user_options;
+
   if (hashconfig_init (hashcat_ctx) == 0)
   {
     hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
+    module_ctx_t *module_ctx = hashcat_ctx->module_ctx;
 
     event_log_info (hashcat_ctx, "Hash mode #%u", hashconfig->hash_mode);
     event_log_info (hashcat_ctx, "  Name................: %s", hashconfig->hash_name);
     event_log_info (hashcat_ctx, "  Category............: %s", strhashcategory (hashconfig->hash_category));
     event_log_info (hashcat_ctx, "  Slow.Hash...........: %s", (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) ? "No" : "Yes");
 
-    event_log_info (hashcat_ctx, "  Password.Len.Min....: %u", hashconfig->pw_min);
-    event_log_info (hashcat_ctx, "  Password.Len.Max....: %u", hashconfig->pw_max);
+    event_log_info (hashcat_ctx, "  Deprecated..........: %s", (module_ctx->module_deprecated_notice != MODULE_DEFAULT) ? "Yes" : "No");
+
+    char *t_deprecated_notice = "N/A\0";
+
+    if (module_ctx->module_deprecated_notice != MODULE_DEFAULT)
+    {
+      t_deprecated_notice = (char *) module_ctx->module_deprecated_notice (hashconfig, hashcat_ctx->user_options, user_options_extra);
+    }
+
+    event_log_info (hashcat_ctx, "  Deprecated.Notice...: %s", t_deprecated_notice);
+
+    const char *t_pw_desc = (hashconfig->opts_type & OPTS_TYPE_PT_HEX) ? "HEX\0" : "plain\0";
+
+    u32 t_pw_min = hashconfig->pw_min;
+    u32 t_pw_max = hashconfig->pw_max;
+
+    if (user_options->hash_info > 1)
+    {
+      if (hashconfig->opts_type & OPTS_TYPE_PT_HEX)
+      {
+        t_pw_min *= 2;
+        t_pw_max *= 2;
+      }
+    }
+
+    event_log_info (hashcat_ctx, "  Password.Type.......: %s", t_pw_desc);
+    event_log_info (hashcat_ctx, "  Password.Len.Min....: %u", t_pw_min);
+    event_log_info (hashcat_ctx, "  Password.Len.Max....: %u", t_pw_max);
 
     if (hashconfig->is_salted == true)
     {
       u32 t = hashconfig->salt_type;
-      const char *t_desc = (t == SALT_TYPE_EMBEDDED) ? "Embedded\0" : (t == SALT_TYPE_GENERIC) ? "Generic\0" : "Virtual\0";
-      event_log_info (hashcat_ctx, "  Salt.Type...........: %s", t_desc);
-      event_log_info (hashcat_ctx, "  Salt.Len.Min........: %u", hashconfig->salt_min);
-      event_log_info (hashcat_ctx, "  Salt.Len.Max........: %u", hashconfig->salt_max);
-    }
 
-    // almost always 1 and -1
-    //event_log_info (hashcat_ctx, "  Hashes.Count.Min....: %d", hashconfig->hashes_count_min);
-    //event_log_info (hashcat_ctx, "  Hashes.Count.Max....: %u", hashconfig->hashes_count_max);
+      const char *t_salt_desc = (t == SALT_TYPE_EMBEDDED) ? "Embedded\0" : (t == SALT_TYPE_GENERIC) ? "Generic\0" : "Virtual\0";
+
+      event_log_info (hashcat_ctx, "  Salt.Type...........: %s", t_salt_desc);
+
+      if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED)
+      {
+        u32 t_salt_min = hashconfig->salt_min;
+        u32 t_salt_max = hashconfig->salt_max;
+
+        if (user_options->hash_info > 1)
+        {
+          if (hashconfig->opts_type & OPTS_TYPE_ST_HEX)
+          {
+            t_salt_min *= 2;
+            t_salt_max *= 2;
+          }
+        }
+
+        event_log_info (hashcat_ctx, "  Salt.Len.Min........: %u", t_salt_min);
+        event_log_info (hashcat_ctx, "  Salt.Len.Max........: %u", t_salt_max);
+      }
+    }
 
     if ((hashconfig->has_pure_kernel) && (hashconfig->has_optimized_kernel))
     {
@@ -884,6 +1010,39 @@ void hash_info_single (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_op
       event_log_info (hashcat_ctx, "  Kernel.Type(s)......: optimized");
     }
 
+    if (user_options->hash_info > 1)
+    {
+      if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
+      {
+        event_log_info (hashcat_ctx, "  Kernel.Type.Filter..: optimized");
+      }
+      else
+      {
+        event_log_info (hashcat_ctx, "  Kernel.Type.Filter..: pure");
+      }
+
+      event_log_info (hashcat_ctx, "  Attack.Mode.Filter..: %u", user_options->attack_mode);
+
+      // almost always 1 and -1
+      event_log_info (hashcat_ctx, "  Hashes.Count.Min....: %d", hashconfig->hashes_count_min);
+      event_log_info (hashcat_ctx, "  Hashes.Count.Max....: %d", hashconfig->hashes_count_max);
+
+      if (hashconfig->salt_type == SALT_TYPE_GENERIC || hashconfig->salt_type == SALT_TYPE_EMBEDDED)
+      {
+        bool multi_hash_same_salt = true;
+
+        if ((hashconfig->opts_type & OPTS_TYPE_DEEP_COMP_KERNEL) == 0)
+        {
+          if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
+          {
+            multi_hash_same_salt = false;
+          }
+        }
+
+        event_log_info (hashcat_ctx, "  Hashes.w/.Same.Salt.: %s", (multi_hash_same_salt == true) ? "Allowed" : "Not allowed");
+      }
+    }
+
     if ((hashconfig->st_hash != NULL) && (hashconfig->st_pass != NULL))
     {
       if (hashconfig->opts_type & OPTS_TYPE_BINARY_HASHFILE)
@@ -987,6 +1146,7 @@ void hash_info_single (hashcat_ctx_t *hashcat_ctx, user_options_extra_t *user_op
     event_log_info (hashcat_ctx, "  Autodetect.Enabled..: %s", (hashconfig->opts_type & OPTS_TYPE_AUTODETECT_DISABLE) ? "No" : "Yes");
     event_log_info (hashcat_ctx, "  Self.Test.Enabled...: %s", (hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) ? "No" : "Yes");
     event_log_info (hashcat_ctx, "  Potfile.Enabled.....: %s", (hashconfig->opts_type & OPTS_TYPE_POTFILE_NOPASS) ? "No" : "Yes");
+    event_log_info (hashcat_ctx, "  Keep.Guessing.......: %s", (hashconfig->opts_type & OPTS_TYPE_SUGGEST_KG) ? "Yes" : "No");
     event_log_info (hashcat_ctx, "  Custom.Plugin.......: %s", (hashconfig->opts_type & OPTS_TYPE_STOCK_MODULE) ? "No" : "Yes");
 
     if (hashconfig->opts_type & OPTS_TYPE_PT_ALWAYS_ASCII)
@@ -1075,18 +1235,43 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
   const user_options_t  *user_options  = hashcat_ctx->user_options;
   const folder_config_t *folder_config = hashcat_ctx->folder_config;
 
+  if (user_options->machine_readable == true)
+  {
+    printf ("{ ");
+  }
+
   if (user_options->backend_info > 1)
   {
-    event_log_info (hashcat_ctx, "System Info:");
-    event_log_info (hashcat_ctx, "============");
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "System Info:");
+      event_log_info (hashcat_ctx, "============");
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"SystemInfo\": { ");
+    }
 
     #if defined (_WIN) || defined (__CYGWIN__) || defined (__MSYS__)
     // TODO
-    event_log_info (hashcat_ctx, "OS.Name......: Windows");
-    event_log_info (hashcat_ctx, "OS.Release...: N/A");
-    event_log_info (hashcat_ctx, "HW.Platform..: N/A");
-    event_log_info (hashcat_ctx, "HW.Model.....: N/A");
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "OS.Name......: Windows");
+      event_log_info (hashcat_ctx, "OS.Release...: N/A");
+      event_log_info (hashcat_ctx, "HW.Platform..: N/A");
+      event_log_info (hashcat_ctx, "HW.Model.....: N/A");
+    }
+    else
+    {
+      printf ("\"OS\": { ");
+      printf ("\"Name\": \"%s\", ", "Windows");
+      printf ("\"Release\": \"%s\" }, ", "N/A");
+      printf ("\"Hardware\": { ");
+      printf ("\"Platform\": \"%s\", ", "N/A");
+      printf ("\"Model\": \"%s\" } ", "N/A");
+      printf ("}, ");
+    }
     #else
 
     struct utsname utsbuf;
@@ -1123,10 +1308,23 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
       rc_uname = true;
     }
 
-    event_log_info (hashcat_ctx, "OS.Name......: %s", (rc_uname  == true) ? utsbuf.sysname : "N/A");
-    event_log_info (hashcat_ctx, "OS.Release...: %s", (rc_uname  == true) ? utsbuf.release : "N/A");
-    event_log_info (hashcat_ctx, "HW.Model.....: %s", (rc_sysctl == true) ? hw_model_buf   : "N/A");
-    event_log_info (hashcat_ctx, "HW.Platform..: %s", (rc_uname  == true) ? utsbuf.machine : "N/A");
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "OS.Name......: %s", (rc_uname  == true) ? utsbuf.sysname : "N/A");
+      event_log_info (hashcat_ctx, "OS.Release...: %s", (rc_uname  == true) ? utsbuf.release : "N/A");
+      event_log_info (hashcat_ctx, "HW.Platform..: %s", (rc_uname  == true) ? utsbuf.machine : "N/A");
+      event_log_info (hashcat_ctx, "HW.Model.....: %s", (rc_sysctl == true) ? hw_model_buf   : "N/A");
+    }
+    else
+    {
+      printf ("\"OS\": { ");
+      printf ("\"Name\": \"%s\", ", (rc_uname  == true) ? utsbuf.sysname : "N/A");
+      printf ("\"Release\": \"%s\" }, ", (rc_uname  == true) ? utsbuf.release : "N/A");
+      printf ("\"Hardware\": { ");
+      printf ("\"Platform\": \"%s\", ", (rc_uname  == true) ? utsbuf.machine : "N/A");
+      printf ("\"Model\": \"%s\" } ", (rc_sysctl == true) ? hw_model_buf : "N/A");
+      printf ("}, ");
+    }
 
     if (rc_sysctl == true)
     {
@@ -1134,38 +1332,72 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
     }
     #endif // _WIN || __CYGWIN__ || __MSYS__
 
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, NULL);
 
-    event_log_info (hashcat_ctx, "Environment Info:");
-    event_log_info (hashcat_ctx, "=================");
-    event_log_info (hashcat_ctx, NULL);
+      event_log_info (hashcat_ctx, "Environment Info:");
+      event_log_info (hashcat_ctx, "=================");
+      event_log_info (hashcat_ctx, NULL);
 
-    event_log_info (hashcat_ctx, "Cur.Work.Dir.: %s", folder_config->cwd);
-    event_log_info (hashcat_ctx, "Install.Dir..: %s", folder_config->install_dir);
-    event_log_info (hashcat_ctx, "Profile.Dir..: %s", folder_config->profile_dir);
-    event_log_info (hashcat_ctx, "Cache.Dir....: %s", folder_config->cache_dir);
-    // uninitialized at this point, for instance if the user uses --session
-    //event_log_info (hashcat_ctx, "Session.Dir..: %s", folder_config->session_dir);
-    event_log_info (hashcat_ctx, "Shared.Dir...: %s", folder_config->shared_dir);
-    event_log_info (hashcat_ctx, "CL.Inc.Path..: %s", folder_config->cpath_real);
+      event_log_info (hashcat_ctx, "Cur.Work.Dir.: %s", folder_config->cwd);
+      event_log_info (hashcat_ctx, "Install.Dir..: %s", folder_config->install_dir);
+      event_log_info (hashcat_ctx, "Profile.Dir..: %s", folder_config->profile_dir);
+      event_log_info (hashcat_ctx, "Cache.Dir....: %s", folder_config->cache_dir);
+      // uninitialized at this point, for instance if the user uses --session
+      //event_log_info (hashcat_ctx, "Session.Dir..: %s", folder_config->session_dir);
+      event_log_info (hashcat_ctx, "Shared.Dir...: %s", folder_config->shared_dir);
+      event_log_info (hashcat_ctx, "CL.Inc.Path..: %s", folder_config->cpath_real);
 
-    event_log_info (hashcat_ctx, NULL);
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"EnvironmentInfo\": { ");
+      printf ("\"CurrentWorkingDirectory\": \"%s\", ", folder_config->cwd);
+      printf ("\"InstallDirectory\": \"%s\", ", folder_config->install_dir);
+      printf ("\"ProfileDirectory\": \"%s\", ", folder_config->profile_dir);
+      printf ("\"CacheDirectory\": \"%s\", ", folder_config->cache_dir);
+      printf ("\"SharedDirectory\": \"%s\", ", folder_config->shared_dir);
+      printf ("\"CLIncludePath\": \"%s\" ", folder_config->cpath_real);
+      printf ("}, ");
+    }
   }
 
   if (backend_ctx->cuda)
   {
-    event_log_info (hashcat_ctx, "CUDA Info:");
-    event_log_info (hashcat_ctx, "==========");
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "CUDA Info:");
+      event_log_info (hashcat_ctx, "==========");
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"CUDAInfo\": { ");
+    }
 
     int cuda_devices_cnt    = backend_ctx->cuda_devices_cnt;
     int cuda_driver_version = backend_ctx->cuda_driver_version;
 
-    event_log_info (hashcat_ctx, "CUDA.Version.: %u.%u", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "CUDA.Version.: %u.%u", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"Version\": \"%u.%u\", ", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);
+      printf ("\"BackendDevices\": [ ");
+    }
 
     for (int cuda_devices_idx = 0; cuda_devices_idx < cuda_devices_cnt; cuda_devices_idx++)
     {
+      if (user_options->machine_readable == true)
+      {
+        printf ("{ ");
+      }
+
       const int backend_devices_idx = backend_ctx->backend_device_from_cuda[cuda_devices_idx];
 
       const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
@@ -1184,29 +1416,88 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
 
       if (device_param->device_id_alias_cnt)
       {
-        event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+        }
+        else
+        {
+          printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+          printf ("\"Alias\": \"%02u\", ", device_param->device_id_alias_buf[0] + 1);
+        }
       }
       else
       {
-        event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1);
+        }
+        else
+        {
+          printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+        }
       }
 
-      event_log_info (hashcat_ctx, "  Name...........: %s", device_name);
-      event_log_info (hashcat_ctx, "  Processor(s)...: %u", device_processors);
-      event_log_info (hashcat_ctx, "  Clock..........: %u", device_maxclock_frequency);
-      event_log_info (hashcat_ctx, "  Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024);
-      event_log_info (hashcat_ctx, "  Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
-      event_log_info (hashcat_ctx, "  Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
-      event_log_info (hashcat_ctx, "  PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function);
-      event_log_info (hashcat_ctx, NULL);
+      if (user_options->machine_readable == false)
+      {
+        event_log_info (hashcat_ctx, "  Name...........: %s", device_name);
+        event_log_info (hashcat_ctx, "  Processor(s)...: %u", device_processors);
+        event_log_info (hashcat_ctx, "  Clock..........: %u", device_maxclock_frequency);
+        event_log_info (hashcat_ctx, "  Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024);
+        event_log_info (hashcat_ctx, "  Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
+        event_log_info (hashcat_ctx, "  Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
+        event_log_info (hashcat_ctx, "  PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function);
+        event_log_info (hashcat_ctx, NULL);
+      }
+      else
+      {
+        printf ("\"Name\": \"%s\", ", device_name);
+        printf ("\"Processor(s)\": \"%u\", ", device_processors);
+        printf ("\"Clock\": \"%u\", ", device_maxclock_frequency);
+        printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024);
+        printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024);
+        printf ("\"LocalMemory\": \"%" PRIu64 " KB\", ", device_local_mem_size / 1024); // divided by 1024 once only: unit is KB, same as the Local.Memory text line
+        printf ("\"PCI.Addr.BDFe\": \"%04x:%02x:%02x.%u\" ", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function);
+      }
+
+      if (user_options->machine_readable == true)
+      {
+        if ((cuda_devices_idx + 1) < cuda_devices_cnt)
+        {
+          printf ("}, ");
+        }
+        else
+        {
+          printf ("} ");
+        }
+      }
+    }
+
+    if (user_options->machine_readable == true)
+    {
+      if (backend_ctx->hip || backend_ctx->mtl || backend_ctx->ocl)
+      {
+        printf ("] }, ");
+      }
+      else
+      {
+        printf ("] } ");
+      }
     }
   }
 
   if (backend_ctx->hip)
   {
-    event_log_info (hashcat_ctx, "HIP Info:");
-    event_log_info (hashcat_ctx, "=========");
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "HIP Info:");
+      event_log_info (hashcat_ctx, "=========");
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"HIPInfo\": { ");
+    }
 
     int hip_devices_cnt    = backend_ctx->hip_devices_cnt;
     int hip_runtimeVersion = backend_ctx->hip_runtimeVersion;
@@ -1217,17 +1508,41 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
       int hip_version_minor = (hip_runtimeVersion - (hip_version_major * 10000000)) / 100000;
       int hip_version_patch = (hip_runtimeVersion - (hip_version_major * 10000000) - (hip_version_minor * 100000));
 
-      event_log_info (hashcat_ctx, "HIP.Version.: %u.%u.%u", hip_version_major, hip_version_minor, hip_version_patch);
-      event_log_info (hashcat_ctx, NULL);
+      if (user_options->machine_readable == false)
+      {
+        event_log_info (hashcat_ctx, "HIP.Version.: %u.%u.%u", hip_version_major, hip_version_minor, hip_version_patch);
+        event_log_info (hashcat_ctx, NULL);
+      }
+      else
+      {
+        printf ("\"Version\": \"%u.%u.%u\", ", hip_version_major, hip_version_minor, hip_version_patch);
+      }
     }
     else
     {
-      event_log_info (hashcat_ctx, "HIP.Version.: %u.%u", hip_runtimeVersion / 100, hip_runtimeVersion % 10);
-      event_log_info (hashcat_ctx, NULL);
+      if (user_options->machine_readable == false)
+      {
+        event_log_info (hashcat_ctx, "HIP.Version.: %u.%u", hip_runtimeVersion / 100, hip_runtimeVersion % 10);
+        event_log_info (hashcat_ctx, NULL);
+      }
+      else
+      {
+        printf ("\"Version\": \"%u.%u\", ", hip_runtimeVersion / 100, hip_runtimeVersion % 10);
+      }
+    }
+
+    if (user_options->machine_readable == true)
+    {
+      printf ("\"BackendDevices\": [ ");
     }
 
     for (int hip_devices_idx = 0; hip_devices_idx < hip_devices_cnt; hip_devices_idx++)
     {
+      if (user_options->machine_readable == true)
+      {
+        printf ("{ ");
+      }
+
       const int backend_devices_idx = backend_ctx->backend_device_from_hip[hip_devices_idx];
 
       const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
@@ -1246,40 +1561,116 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
 
       if (device_param->device_id_alias_cnt)
       {
-        event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+        }
+        else
+        {
+          printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+          printf ("\"Alias\": \"%02u\", ", device_param->device_id_alias_buf[0] + 1);
+        }
       }
       else
       {
-        event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1);
+        }
+        else
+        {
+          printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+        }
       }
 
-      event_log_info (hashcat_ctx, "  Name...........: %s", device_name);
-      event_log_info (hashcat_ctx, "  Processor(s)...: %u", device_processors);
-      event_log_info (hashcat_ctx, "  Clock..........: %u", device_maxclock_frequency);
-      event_log_info (hashcat_ctx, "  Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024);
-      event_log_info (hashcat_ctx, "  Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
-      event_log_info (hashcat_ctx, "  Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
-      event_log_info (hashcat_ctx, "  PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function);
-      event_log_info (hashcat_ctx, NULL);
+      if (user_options->machine_readable == false)
+      {
+        event_log_info (hashcat_ctx, "  Name...........: %s", device_name);
+        event_log_info (hashcat_ctx, "  Processor(s)...: %u", device_processors);
+        event_log_info (hashcat_ctx, "  Clock..........: %u", device_maxclock_frequency);
+        event_log_info (hashcat_ctx, "  Memory.Total...: %" PRIu64 " MB", device_global_mem / 1024 / 1024);
+        event_log_info (hashcat_ctx, "  Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
+        event_log_info (hashcat_ctx, "  Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
+        event_log_info (hashcat_ctx, "  PCI.Addr.BDFe..: %04x:%02x:%02x.%u", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function);
+        event_log_info (hashcat_ctx, NULL);
+      }
+      else
+      {
+        printf ("\"Name\": \"%s\", ", device_name);
+        printf ("\"Processor(s)\": \"%u\", ", device_processors);
+        printf ("\"Clock\": \"%u\", ", device_maxclock_frequency);
+        printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024);
+        printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024);
+        printf ("\"LocalMemory\": \"%" PRIu64 " KB\", ", device_local_mem_size / 1024); // divided by 1024 once only: unit is KB, same as the Local.Memory text line
+        printf ("\"PCI.Addr.BDFe\": \"%04x:%02x:%02x.%u\" ", (u16) pcie_domain, pcie_bus, pcie_device, pcie_function);
+      }
+
+      if (user_options->machine_readable == true)
+      {
+        if ((hip_devices_idx + 1) < hip_devices_cnt)
+        {
+          printf ("}, ");
+        }
+        else
+        {
+          printf ("} ");
+        }
+      }
+    }
+
+    if (user_options->machine_readable == true)
+    {
+      if (backend_ctx->mtl || backend_ctx->ocl)
+      {
+        printf ("] }, ");
+      }
+      else
+      {
+        printf ("] } ");
+      }
     }
   }
 
   #if defined (__APPLE__)
   if (backend_ctx->mtl)
   {
-    event_log_info (hashcat_ctx, "Metal Info:");
-    event_log_info (hashcat_ctx, "===========");
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "Metal Info:");
+      event_log_info (hashcat_ctx, "===========");
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"MetalInfo\": { ");
+    }
 
     int metal_devices_cnt = backend_ctx->metal_devices_cnt;
 
     char *metal_runtimeVersionStr = backend_ctx->metal_runtimeVersionStr;
 
-    event_log_info (hashcat_ctx, "Metal.Version.: %s", metal_runtimeVersionStr);
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "Metal.Version.: %s", metal_runtimeVersionStr);
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"Version\": \"%s\", ", metal_runtimeVersionStr);
+    }
+
+    if (user_options->machine_readable == true)
+    {
+      printf ("\"BackendDevices\": [ ");
+    }
 
     for (int metal_devices_idx = 0; metal_devices_idx < metal_devices_cnt; metal_devices_idx++)
     {
+      if (user_options->machine_readable == true)
+      {
+        printf ("{ ");
+      }
+
       const int backend_devices_idx = backend_ctx->backend_device_from_metal[metal_devices_idx];
 
       const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
@@ -1310,30 +1701,111 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
 
       if (device_param->device_id_alias_cnt)
       {
-        event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+        }
+        else
+        {
+          printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+          printf ("\"Alias\": \"%02u\", ", device_param->device_id_alias_buf[0] + 1);
+        }
       }
       else
       {
-        event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "Backend Device ID #%02u", device_id + 1);
+        }
+        else
+        {
+          printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+        }
       }
 
-      event_log_info (hashcat_ctx, "  Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
-      event_log_info (hashcat_ctx, "  Vendor.ID......: %u", opencl_device_vendor_id);
-      event_log_info (hashcat_ctx, "  Vendor.........: %s", opencl_device_vendor);
-      event_log_info (hashcat_ctx, "  Name...........: %s", device_name);
-      event_log_info (hashcat_ctx, "  Processor(s)...: %u", device_processors);
-      event_log_info (hashcat_ctx, "  Clock..........: N/A");
-      event_log_info (hashcat_ctx, "  Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024);
-      event_log_info (hashcat_ctx, "  Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
-      event_log_info (hashcat_ctx, "  Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
+      if (user_options->machine_readable == false)
+      {
+        event_log_info (hashcat_ctx, "  Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
+        event_log_info (hashcat_ctx, "  Vendor.ID......: %u", opencl_device_vendor_id);
+        event_log_info (hashcat_ctx, "  Vendor.........: %s", opencl_device_vendor);
+        event_log_info (hashcat_ctx, "  Name...........: %s", device_name);
+        event_log_info (hashcat_ctx, "  Processor(s)...: %u", device_processors);
+        event_log_info (hashcat_ctx, "  Clock..........: N/A");
+        event_log_info (hashcat_ctx, "  Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024);
+        event_log_info (hashcat_ctx, "  Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
+        event_log_info (hashcat_ctx, "  Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
+      }
+      else
+      {
+        printf ("\"Type\": \"%s\", ", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
+        printf ("\"VendorID\": \"%u\", ", opencl_device_vendor_id);
+        printf ("\"Vendor\": \"%s\", ", opencl_device_vendor);
+        printf ("\"Name\": \"%s\", ", device_name);
+        printf ("\"Processor(s)\": \"%u\", ", device_processors);
+        printf ("\"Clock\": \"%s\", ", "N/A");
+        printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024);
+        printf ("\"MemoryAllocPerBlock\": \"%" PRIu64 " MB\", ", device_maxmem_alloc / 1024 / 1024);
+        printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024);
+        printf ("\"LocalMemory\": \"%" PRIu64 " KB\", ", device_local_mem_size / 1024); // divided by 1024 once only: unit is KB, same as the Local.Memory text line
+      }
 
       switch (device_physical_location)
       {
-        case MTL_DEVICE_LOCATION_BUILTIN:     event_log_info (hashcat_ctx, "  Phys.Location..: built-in"); break;
-        case MTL_DEVICE_LOCATION_SLOT:        event_log_info (hashcat_ctx, "  Phys.Location..: connected to slot %u", device_location_number); break;
-        case MTL_DEVICE_LOCATION_EXTERNAL:    event_log_info (hashcat_ctx, "  Phys.Location..: connected via an external interface (port %u)", device_location_number); break;
-        case MTL_DEVICE_LOCATION_UNSPECIFIED: event_log_info (hashcat_ctx, "  Phys.Location..: unspecified"); break;
-        default:                              event_log_info (hashcat_ctx, "  Phys.Location..: N/A"); break;
+        case MTL_DEVICE_LOCATION_BUILTIN:
+          if (user_options->machine_readable == false)
+          {
+            event_log_info (hashcat_ctx, "  Phys.Location..: built-in");
+          }
+          else
+          {
+            printf ("\"PhysicalLocation\": \"built-in\", ");
+          }
+
+          break;
+        case MTL_DEVICE_LOCATION_SLOT:
+          if (user_options->machine_readable == false)
+          {
+            event_log_info (hashcat_ctx, "  Phys.Location..: connected to slot %u", device_location_number);
+          }
+          else
+          {
+            printf ("\"PhysicalLocation\": \"connected to slot %u\", ", device_location_number);
+          }
+
+          break;
+        case MTL_DEVICE_LOCATION_EXTERNAL:
+          if (user_options->machine_readable == false)
+          {
+            event_log_info (hashcat_ctx, "  Phys.Location..: connected via an external interface (port %u)", device_location_number);
+          }
+          else
+          {
+            printf ("\"PhysicalLocation\": \"connected via an external interface (port %u)\", ", device_location_number);
+          }
+
+          break;
+        case MTL_DEVICE_LOCATION_UNSPECIFIED:
+          if (user_options->machine_readable == false)
+          {
+            event_log_info (hashcat_ctx, "  Phys.Location..: unspecified");
+          }
+          else
+          {
+            printf ("\"PhysicalLocation\": \"unspecified\", ");
+          }
+
+          break;
+        default:
+          if (user_options->machine_readable == false)
+          {
+            event_log_info (hashcat_ctx, "  Phys.Location..: N/A");
+          }
+          else
+          {
+            printf ("\"PhysicalLocation\": \"%s\", ", "N/A");
+          }
+
+          break;
       }
 
       /*
@@ -1347,28 +1819,92 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
       }
       */
 
-      event_log_info (hashcat_ctx, "  Registry.ID....: %u", device_registryID);
-
-      if (device_physical_location != MTL_DEVICE_LOCATION_BUILTIN)
+      if (user_options->machine_readable == false)
       {
-        event_log_info (hashcat_ctx, "  Max.TX.Rate....: %u MB/sec", device_max_transfer_rate);
+        event_log_info (hashcat_ctx, "  Registry.ID....: %u", device_registryID);
       }
       else
       {
-        event_log_info (hashcat_ctx, "  Max.TX.Rate....: N/A");
+        printf ("\"RegistryID\": \"%u\", ", device_registryID);
       }
 
-      event_log_info (hashcat_ctx, "  GPU.Properties.: headless %u, low-power %u, removable %u", device_is_headless, device_is_low_power, device_is_removable);
-      event_log_info (hashcat_ctx, NULL);
+      if (device_physical_location != MTL_DEVICE_LOCATION_BUILTIN)
+      {
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "  Max.TX.Rate....: %u MB/sec", device_max_transfer_rate);
+        }
+        else
+        {
+          printf ("\"MaxTXRate\": \"%u MB/sec\", ", device_max_transfer_rate);
+        }
+      }
+      else
+      {
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "  Max.TX.Rate....: N/A");
+        }
+        else
+        {
+          printf ("\"MaxTXRate\": \"%s\", ", "N/A");
+        }
+      }
+
+      if (user_options->machine_readable == false)
+      {
+        event_log_info (hashcat_ctx, "  GPU.Properties.: headless %u, low-power %u, removable %u", device_is_headless, device_is_low_power, device_is_removable);
+        event_log_info (hashcat_ctx, NULL);
+      }
+      else
+      {
+        printf ("\"GPUProperties\": { ");
+        printf ("\"headless\": \"%u\", ", device_is_headless);
+        printf ("\"low_power\": \"%u\", ", device_is_low_power);
+        printf ("\"removable\": \"%u\" ", device_is_removable);
+        printf ("} ");
+      }
+
+      if (user_options->machine_readable == true)
+      {
+        if ((metal_devices_idx + 1) < metal_devices_cnt)
+        {
+          printf ("}, ");
+        }
+        else
+        {
+          printf ("} ");
+        }
+      }
+    }
+
+    if (user_options->machine_readable == true)
+    {
+      if (backend_ctx->ocl)
+      {
+        printf ("] }, ");
+      }
+      else
+      {
+        printf ("] } ");
+      }
     }
   }
   #endif
 
   if (backend_ctx->ocl)
   {
-    event_log_info (hashcat_ctx, "OpenCL Info:");
-    event_log_info (hashcat_ctx, "============");
-    event_log_info (hashcat_ctx, NULL);
+    if (user_options->machine_readable == false)
+    {
+      event_log_info (hashcat_ctx, "OpenCL Info:");
+      event_log_info (hashcat_ctx, "============");
+      event_log_info (hashcat_ctx, NULL);
+    }
+    else
+    {
+      printf ("\"OpenCLInfo\": { ");
+      printf ("\"Platforms\": [ ");
+    }
 
     cl_uint   opencl_platforms_cnt         = backend_ctx->opencl_platforms_cnt;
     cl_uint  *opencl_platforms_devices_cnt = backend_ctx->opencl_platforms_devices_cnt;
@@ -1378,19 +1914,44 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
 
     for (cl_uint opencl_platforms_idx = 0; opencl_platforms_idx < opencl_platforms_cnt; opencl_platforms_idx++)
     {
+      if (user_options->machine_readable == true)
+      {
+        printf ("{ ");
+      }
+
       char     *opencl_platform_vendor       = opencl_platforms_vendor[opencl_platforms_idx];
       char     *opencl_platform_name         = opencl_platforms_name[opencl_platforms_idx];
       char     *opencl_platform_version      = opencl_platforms_version[opencl_platforms_idx];
       cl_uint   opencl_platform_devices_cnt  = opencl_platforms_devices_cnt[opencl_platforms_idx];
 
-      event_log_info (hashcat_ctx, "OpenCL Platform ID #%u", opencl_platforms_idx + 1);
-      event_log_info (hashcat_ctx, "  Vendor..: %s",  opencl_platform_vendor);
-      event_log_info (hashcat_ctx, "  Name....: %s",  opencl_platform_name);
-      event_log_info (hashcat_ctx, "  Version.: %s",  opencl_platform_version);
-      event_log_info (hashcat_ctx, NULL);
+      if (user_options->machine_readable == false)
+      {
+        event_log_info (hashcat_ctx, "OpenCL Platform ID #%u", opencl_platforms_idx + 1);
+        event_log_info (hashcat_ctx, "  Vendor..: %s",  opencl_platform_vendor);
+        event_log_info (hashcat_ctx, "  Name....: %s",  opencl_platform_name);
+        event_log_info (hashcat_ctx, "  Version.: %s",  opencl_platform_version);
+        event_log_info (hashcat_ctx, NULL);
+      }
+      else
+      {
+        printf ("\"PlatformID\": \"%u\", ", opencl_platforms_idx + 1);
+        printf ("\"Vendor\": \"%s\", ", opencl_platform_vendor);
+        printf ("\"Name\": \"%s\", ", opencl_platform_name);
+        printf ("\"Version\": \"%s\", ", opencl_platform_version);
+      }
+
+      if (user_options->machine_readable == true)
+      {
+        printf ("\"BackendDevices\": [ ");
+      }
 
       for (cl_uint opencl_platform_devices_idx = 0; opencl_platform_devices_idx < opencl_platform_devices_cnt; opencl_platform_devices_idx++)
       {
+        if (user_options->machine_readable == true)
+        {
+          printf ("{ ");
+        }
+
         const int backend_devices_idx = backend_ctx->backend_device_from_opencl_platform[opencl_platforms_idx][opencl_platform_devices_idx];
 
         const hc_device_param_t *device_param = backend_ctx->devices_param + backend_devices_idx;
@@ -1412,25 +1973,58 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
 
         if (device_param->device_id_alias_cnt)
         {
-          event_log_info (hashcat_ctx, "  Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+          if (user_options->machine_readable == false)
+          {
+            event_log_info (hashcat_ctx, "  Backend Device ID #%02u (Alias: #%02u)", device_id + 1, device_param->device_id_alias_buf[0] + 1);
+          }
+          else
+          {
+            printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+            printf ("\"Alias\": \"%02u\", ", device_param->device_id_alias_buf[0] + 1);
+          }
         }
         else
         {
-          event_log_info (hashcat_ctx, "  Backend Device ID #%02u", device_id + 1);
+          if (user_options->machine_readable == false)
+          {
+            event_log_info (hashcat_ctx, "  Backend Device ID #%02u", device_id + 1);
+          }
+          else
+          {
+            printf ("\"DeviceID\": \"%02u\", ", device_id + 1);
+          }
         }
 
-        event_log_info (hashcat_ctx, "    Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
-        event_log_info (hashcat_ctx, "    Vendor.ID......: %u", opencl_device_vendor_id);
-        event_log_info (hashcat_ctx, "    Vendor.........: %s", opencl_device_vendor);
-        event_log_info (hashcat_ctx, "    Name...........: %s", device_name);
-        event_log_info (hashcat_ctx, "    Version........: %s", opencl_device_version);
-        event_log_info (hashcat_ctx, "    Processor(s)...: %u", device_processors);
-        event_log_info (hashcat_ctx, "    Clock..........: %u", device_maxclock_frequency);
-        event_log_info (hashcat_ctx, "    Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024);
-        event_log_info (hashcat_ctx, "    Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
-        event_log_info (hashcat_ctx, "    Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
-        event_log_info (hashcat_ctx, "    OpenCL.Version.: %s", opencl_device_c_version);
-        event_log_info (hashcat_ctx, "    Driver.Version.: %s", opencl_driver_version);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, "    Type...........: %s", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
+          event_log_info (hashcat_ctx, "    Vendor.ID......: %u", opencl_device_vendor_id);
+          event_log_info (hashcat_ctx, "    Vendor.........: %s", opencl_device_vendor);
+          event_log_info (hashcat_ctx, "    Name...........: %s", device_name);
+          event_log_info (hashcat_ctx, "    Version........: %s", opencl_device_version);
+          event_log_info (hashcat_ctx, "    Processor(s)...: %u", device_processors);
+          event_log_info (hashcat_ctx, "    Clock..........: %u", device_maxclock_frequency);
+          event_log_info (hashcat_ctx, "    Memory.Total...: %" PRIu64 " MB (limited to %" PRIu64 " MB allocatable in one block)", device_global_mem / 1024 / 1024, device_maxmem_alloc / 1024 / 1024);
+          event_log_info (hashcat_ctx, "    Memory.Free....: %" PRIu64 " MB", device_available_mem / 1024 / 1024);
+          event_log_info (hashcat_ctx, "    Local.Memory...: %" PRIu64 " KB", device_local_mem_size / 1024);
+          event_log_info (hashcat_ctx, "    OpenCL.Version.: %s", opencl_device_c_version);
+          event_log_info (hashcat_ctx, "    Driver.Version.: %s", opencl_driver_version);
+        }
+        else
+        {
+          printf ("\"Type\": \"%s\", ", ((opencl_device_type & CL_DEVICE_TYPE_CPU) ? "CPU" : ((opencl_device_type & CL_DEVICE_TYPE_GPU) ? "GPU" : "Accelerator")));
+          printf ("\"VendorID\": \"%u\", ", opencl_device_vendor_id);
+          printf ("\"Vendor\": \"%s\", ", opencl_device_vendor);
+          printf ("\"Name\": \"%s\", ", device_name);
+          printf ("\"Processor(s)\": \"%u\", ", device_processors);
+          printf ("\"Clock\": \"%u\", ", device_maxclock_frequency);
+          printf ("\"MemoryTotal\": \"%" PRIu64 " MB\", ", device_global_mem / 1024 / 1024);
+          printf ("\"MemoryAllocPerBlock\": \"%" PRIu64 " MB\", ", device_maxmem_alloc / 1024 / 1024);
+          printf ("\"MemoryFree\": \"%" PRIu64 " MB\", ", device_available_mem / 1024 / 1024);
+          printf ("\"LocalMemory\": \"%" PRIu64 " MB\", ", device_local_mem_size / 1024);
+          printf ("\"OpenCLVersion\": \"%s\", ", opencl_device_c_version);
+          printf ("\"DriverVersion\": \"%s\" ", opencl_device_version);
+        }
 
         if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
         {
@@ -1440,18 +2034,68 @@ void backend_info (hashcat_ctx_t *hashcat_ctx)
 
           if ((device_param->opencl_platform_vendor_id == VENDOR_ID_AMD) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD))
           {
-            event_log_info (hashcat_ctx, "    PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function);
+            if (user_options->machine_readable == false)
+            {
+              event_log_info (hashcat_ctx, "    PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function);
+            }
+            else
+            {
+              printf (", \"PCI.Addr.BDF\": \"%02x:%02x.%u\" ", pcie_bus, pcie_device, pcie_function);
+            }
           }
 
           if ((device_param->opencl_platform_vendor_id == VENDOR_ID_NV) && (device_param->opencl_device_vendor_id == VENDOR_ID_NV))
           {
-            event_log_info (hashcat_ctx, "    PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function);
+            if (user_options->machine_readable == false)
+            {
+              event_log_info (hashcat_ctx, "    PCI.Addr.BDF...: %02x:%02x.%u", pcie_bus, pcie_device, pcie_function);
+            }
+            else
+            {
+              printf (", \"PCI.Addr.BDF\": \"%02x:%02x.%u\" ", pcie_bus, pcie_device, pcie_function);
+            }
           }
         }
 
-        event_log_info (hashcat_ctx, NULL);
+        if (user_options->machine_readable == false)
+        {
+          event_log_info (hashcat_ctx, NULL);
+        }
+        else
+        {
+          if ((opencl_platform_devices_idx + 1) < opencl_platform_devices_cnt)
+          {
+            printf ("}, ");
+          }
+          else
+          {
+            printf ("} ");
+          }
+        }
+      }
+
+      if (user_options->machine_readable == true)
+      {
+        if ((opencl_platforms_idx + 1) < opencl_platforms_cnt)
+        {
+          printf ("] }, ");
+        }
+        else
+        {
+          printf ("] } ");
+        }
       }
     }
+
+    if (user_options->machine_readable == true)
+    {
+      printf ("] } ");
+    }
+  }
+
+  if (user_options->machine_readable == true)
+  {
+    printf ("}");
   }
 }
 
@@ -2675,18 +3319,10 @@ void status_display (hashcat_ctx_t *hashcat_ctx)
     }
     else
     {
-      event_log_info (hashcat_ctx,
-        "Remaining........: %u (%.2f%%) Digests",
-        digests_remain,
-        digests_remain_percent);
+      event_log_info (hashcat_ctx, "Remaining........: %u (%.2f%%) Digests", digests_remain, digests_remain_percent);
     }
-  }
 
-  if (hashcat_status->digests_cnt > 1000)
-  {
-    event_log_info (hashcat_ctx,
-      "Recovered/Time...: %s",
-      hashcat_status->cpt);
+    event_log_info (hashcat_ctx, "Recovered/Time...: %s", hashcat_status->cpt);
   }
 
   switch (hashcat_status->progress_mode)
@@ -3061,7 +3697,7 @@ void status_speed_json (hashcat_ctx_t *hashcat_ctx)
       printf (",");
     }
 
-    printf (" { \"device_id\": %u,", device_id + 1);
+    printf (" { \"device_id\": %d,", device_id + 1);
     printf (" \"speed\": %" PRIu64 " }", (u64) (device_info->hashes_msec_dev_benchmark * 1000));
     device_num++;
   }
@@ -3178,7 +3814,7 @@ void status_progress_json (hashcat_ctx_t *hashcat_ctx)
       printf (",");
     }
 
-    printf (" { \"device_id\": %u,", device_id + 1);
+    printf (" { \"device_id\": %d,", device_id + 1);
     printf (" \"progress\": %" PRIu64 ",", device_info->progress_dev);
     printf (" \"runtime\": %0.2f }", device_info->runtime_msec_dev);
     device_num++;
diff --git a/src/tuningdb.c b/src/tuningdb.c
index 41d3795aa..2c507feb7 100644
--- a/src/tuningdb.c
+++ b/src/tuningdb.c
@@ -56,8 +56,8 @@ int tuning_db_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage         > 0)    return 0;
   if (user_options->backend_info  > 0)    return 0;
+  if (user_options->hash_info     > 0)    return 0;
 
-  if (user_options->hash_info    == true) return 0;
   if (user_options->keyspace     == true) return 0;
   if (user_options->left         == true) return 0;
   if (user_options->show         == true) return 0;
diff --git a/src/usage.c b/src/usage.c
index 01cb31bfe..94a8a0cff 100644
--- a/src/usage.c
+++ b/src/usage.c
@@ -100,7 +100,7 @@ static const char *const USAGE_BIG_PRE_HASHMODES[] =
   "     --bridge-parameter4        | Str  | Sets the generic parameter 4 for a Bridge            |",
   "     --cpu-affinity             | Str  | Locks to CPU devices, separated with commas          | --cpu-affinity=1,2,3",
   "     --hook-threads             | Num  | Sets number of threads for a hook (per compute unit) | --hook-threads=8",
-  "     --hash-info                |      | Show information for each hash-mode                  |",
+  " -H, --hash-info                |      | Show information for each hash-mode                  | -H or -HH",
   "     --example-hashes           |      | Alias of --hash-info                                 |",
   "     --backend-ignore-cuda      |      | Do not try to open CUDA interface on startup         |",
   "     --backend-ignore-hip       |      | Do not try to open HIP interface on startup          |",
@@ -126,6 +126,7 @@ static const char *const USAGE_BIG_PRE_HASHMODES[] =
   " -s, --skip                     | Num  | Skip X words from the start                          | -s 1000000",
   " -l, --limit                    | Num  | Limit X words from the start + skipped words         | -l 1000000",
   "     --keyspace                 |      | Show keyspace base:mod values and quit               |",
+  "     --total-candidates         |      | Show total candidate count (base*mod) and quit       |",
   " -j, --rule-left                | Rule | Single rule applied to each word from left wordlist  | -j 'c'",
   " -k, --rule-right               | Rule | Single rule applied to each word from right wordlist | -k '^-'",
   " -r, --rules-file               | File | Multiple rules applied to each word from wordlists   | -r rules/best64.rule",
diff --git a/src/user_options.c b/src/user_options.c
index a7d3bc508..cb27a9b47 100644
--- a/src/user_options.c
+++ b/src/user_options.c
@@ -21,9 +21,9 @@
 #endif
 
 #ifdef WITH_BRAIN
-static const char *const short_options = "hVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:R:z";
+static const char *const short_options = "hHVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:R:z";
 #else
-static const char *const short_options = "hVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:R:";
+static const char *const short_options = "hHVvm:a:r:j:k:g:o:t:d:D:n:u:T:c:p:s:l:1:2:3:4:iIbw:OMSY:R:";
 #endif
 
 static char *const SEPARATOR = ":";
@@ -93,6 +93,7 @@ static const struct option long_options[] =
   {"kernel-threads",            required_argument, NULL, IDX_KERNEL_THREADS},
   {"keyboard-layout-mapping",   required_argument, NULL, IDX_KEYBOARD_LAYOUT_MAPPING},
   {"keyspace",                  no_argument,       NULL, IDX_KEYSPACE},
+  {"total-candidates",          no_argument,       NULL, IDX_TOTAL_CANDIDATES},
   {"left",                      no_argument,       NULL, IDX_LEFT},
   {"limit",                     required_argument, NULL, IDX_LIMIT},
   {"logfile-disable",           no_argument,       NULL, IDX_LOGFILE_DISABLE},
@@ -246,6 +247,7 @@ int user_options_init (hashcat_ctx_t *hashcat_ctx)
   user_options->kernel_threads            = KERNEL_THREADS;
   user_options->keyboard_layout_mapping   = NULL;
   user_options->keyspace                  = KEYSPACE;
+  user_options->total_candidates          = TOTAL_CANDIDATES;
   user_options->left                      = LEFT;
   user_options->limit                     = LIMIT;
   user_options->logfile                   = LOGFILE;
@@ -440,7 +442,7 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv)
       case IDX_ENCODING_TO:               user_options->encoding_to               = optarg;                          break;
       case IDX_INDUCTION_DIR:             user_options->induction_dir             = optarg;                          break;
       case IDX_OUTFILE_CHECK_DIR:         user_options->outfile_check_dir         = optarg;                          break;
-      case IDX_HASH_INFO:                 user_options->hash_info                 = true;                            break;
+      case IDX_HASH_INFO:                 user_options->hash_info++;                                                 break;
       case IDX_FORCE:                     user_options->force                     = true;                            break;
       case IDX_SELF_TEST_DISABLE:         user_options->self_test                 = false;                           break;
       case IDX_SKIP:                      user_options->skip                      = hc_strtoull (optarg, NULL, 10);
@@ -449,6 +451,7 @@ int user_options_getopt (hashcat_ctx_t *hashcat_ctx, int argc, char **argv)
                                           user_options->limit_chgd                = true;                            break;
       case IDX_KEEP_GUESSING:             user_options->keep_guessing             = true;                            break;
       case IDX_KEYSPACE:                  user_options->keyspace                  = true;                            break;
+      case IDX_TOTAL_CANDIDATES:          user_options->total_candidates          = true;                            break;
       case IDX_BENCHMARK:                 user_options->benchmark                 = true;                            break;
       case IDX_BENCHMARK_ALL:             user_options->benchmark_all             = true;                            break;
       case IDX_BENCHMARK_MAX:             user_options->benchmark_max             = hc_strtoul (optarg, NULL, 10);   break;
@@ -1359,7 +1362,7 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx)
       return -1;
     }
 
-    if (user_options->hash_info == true)
+    if (user_options->hash_info > 0)
     {
       event_log_error (hashcat_ctx, "Use of --hash-info is not allowed in benchmark mode.");
 
@@ -1551,6 +1554,13 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx)
     return -1;
   }
 
+  if (user_options->hash_info > 2)
+  {
+    event_log_error (hashcat_ctx, "Invalid --hash-info/-H value, must have a value greater or equal to 0 and lower than 3.");
+
+    return -1;
+  }
+
   #ifdef WITH_BRAIN
   if ((user_options->brain_client == true) && (user_options->remove == true))
   {
@@ -1645,7 +1655,7 @@ int user_options_sanity (hashcat_ctx_t *hashcat_ctx)
       show_error = false;
     }
   }
-  else if (user_options->hash_info == true)
+  else if (user_options->hash_info > 0)
   {
     if (user_options->hc_argc == 0)
     {
@@ -1843,7 +1853,7 @@ void user_options_session_auto (hashcat_ctx_t *hashcat_ctx)
       user_options->session = "benchmark";
     }
 
-    if (user_options->hash_info == true)
+    if (user_options->hash_info > 0)
     {
       user_options->session = "hash_info";
     }
@@ -1863,6 +1873,11 @@ void user_options_session_auto (hashcat_ctx_t *hashcat_ctx)
       user_options->session = "progress_only";
     }
 
+    if (user_options->total_candidates == true)
+    {
+      user_options->session = "candidates";
+    }
+
     if (user_options->keyspace == true)
     {
       user_options->session = "keyspace";
@@ -1934,12 +1949,13 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
     user_options->bitmap_max          = 1;
   }
 
-  if (user_options->hash_info        == true
-   || user_options->keyspace         == true
+  if (user_options->keyspace         == true
+   || user_options->total_candidates == true
    || user_options->speed_only       == true
    || user_options->progress_only    == true
    || user_options->identify         == true
    || user_options->usage             > 0
+   || user_options->hash_info         > 0
    || user_options->backend_info      > 0)
   {
     user_options->hwmon               = false;
@@ -1991,7 +2007,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
     }
   }
 
-  if (user_options->hash_info == true)
+  if (user_options->hash_info > 0)
   {
     user_options->quiet = true;
   }
@@ -2006,6 +2022,11 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
     user_options->speed_only = true;
   }
 
+  if (user_options->total_candidates == true)
+  {
+    user_options->quiet = true;
+  }
+
   if (user_options->keyspace == true)
   {
     user_options->quiet = true;
@@ -2016,6 +2037,11 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
     user_options->backend_vector_width = 1;
   }
 
+  if (user_options->total_candidates == true)
+  {
+    user_options->keyspace = true;
+  }
+
   if (user_options->stdout_flag == true)
   {
     user_options->force                 = true;
@@ -2105,7 +2131,7 @@ void user_options_preprocess (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->attack_mode == ATTACK_MODE_BF)
   {
-    if (user_options->hash_info == true)
+    if (user_options->hash_info > 0)
     {
 
     }
@@ -2394,7 +2420,7 @@ void user_options_extra_init (hashcat_ctx_t *hashcat_ctx)
   {
 
   }
-  else if (user_options->hash_info == true)
+  else if (user_options->hash_info > 0)
   {
 
   }
@@ -3374,6 +3400,7 @@ void user_options_logger (hashcat_ctx_t *hashcat_ctx)
   logfile_top_uint   (user_options->kernel_loops);
   logfile_top_uint   (user_options->kernel_threads);
   logfile_top_uint   (user_options->keyspace);
+  logfile_top_uint   (user_options->total_candidates);
   logfile_top_uint   (user_options->left);
   logfile_top_uint   (user_options->logfile);
   logfile_top_uint   (user_options->loopback);
diff --git a/src/wordlist.c b/src/wordlist.c
index 1776b4885..bfe09de89 100644
--- a/src/wordlist.c
+++ b/src/wordlist.c
@@ -697,9 +697,9 @@ int wl_data_init (hashcat_ctx_t *hashcat_ctx)
 
   if (user_options->usage         > 0)    return 0;
   if (user_options->backend_info  > 0)    return 0;
+  if (user_options->hash_info     > 0)    return 0;
 
   if (user_options->benchmark    == true) return 0;
-  if (user_options->hash_info    == true) return 0;
   if (user_options->left         == true) return 0;
   if (user_options->version      == true) return 0;
 
diff --git a/tools/install_modules.sh b/tools/install_modules.sh
index 683d20000..948856e34 100755
--- a/tools/install_modules.sh
+++ b/tools/install_modules.sh
@@ -18,6 +18,7 @@ cpan install Authen::Passphrase::LANManager \
              Bitcoin::Crypto::Base58        \
              Compress::Zlib                 \
              Convert::EBCDIC                \
+             Crypt::Argon2                  \
              Crypt::AuthEnc::GCM            \
              Crypt::Camellia                \
              Crypt::CBC                     \
diff --git a/tools/test_modules/m34000.pm b/tools/test_modules/m34000.pm
new file mode 100644
index 000000000..7b2e671ff
--- /dev/null
+++ b/tools/test_modules/m34000.pm
@@ -0,0 +1,79 @@
+#!/usr/bin/env perl
+
+##
+## Author......: See docs/credits.txt
+## License.....: MIT
+##
+
+use strict;
+use warnings;
+
+use MIME::Base64  qw (decode_base64 encode_base64);
+use Crypt::Argon2 qw (argon2_raw);
+
+sub module_constraints { return [[0, 256], [32, 32], [-1, -1], [-1, -1], [-1, -1]]; } # reference to five two-element pairs
+
+sub module_generate_hash
+{
+  # Args: word, hex-encoded salt, then optional signature, m (handed to argon2_raw with a "k" suffix), t, p, digest length.
+  my ($word, $salt, $variant, $mem, $iter, $lanes, $out_len) = @_;
+
+  # Undefined optional arguments fall back to defaults; signature and digest length are drawn via random_number.
+  $variant //= ("argon2d","argon2i","argon2id")[random_number (0, 2)];
+  $mem     //= 65536;
+  $iter    //= 3;
+  $lanes   //= 1;
+  $out_len //= random_number (1, 2) * 16;
+
+  my $raw_salt   = pack ("H*", $salt);
+  my $raw_digest = argon2_raw ($variant, $word, $raw_salt, $iter, "${mem}k", $lanes, $out_len);
+
+  # Base64-encode salt and digest without line breaks, then strip the trailing "=" padding.
+  my ($b64_salt, $b64_digest) = map { my $enc = encode_base64 ($_, ""); $enc =~ s/=+$//; $enc } ($raw_salt, $raw_digest);
+
+  # Assemble the encoded string: $<signature>$v=19$m=..,t=..,p=..$<salt>$<digest>
+  return sprintf ('$%s$v=19$m=%d,t=%d,p=%d$%s$%s', $variant, $mem, $iter, $lanes, $b64_salt, $b64_digest);
+}
+
+sub module_verify_hash
+{
+  # Takes one "hash:word" line, rebuilds the hash from the parameters embedded in it
+  # and returns the pair (rebuilt hash, word); returns nothing if the line does not parse.
+  my $line = shift;
+
+  # Everything before the first ':' is the hash, the remainder is the word.
+  my ($hash, $word) = split (':', $line, 2);
+
+  return unless defined $word;
+
+  # Only the three argon2 signatures are accepted.
+  return unless grep { index ($hash, $_) == 0 } ('$argon2d$', '$argon2i$', '$argon2id$');
+
+  # The hash starts with '$', so the first split field is empty and gets discarded.
+  my (undef, $signature, $version, $config, $salt_b64, $digest_b64) = split '\$', $hash;
+
+  return if grep { ! defined $_ } ($signature, $version, $config, $salt_b64, $digest_b64);
+
+  return unless ($version eq "v=19");
+
+  # The config field is split on ',' and the part after '=' of each of the first three entries is kept.
+  my ($m_config, $t_config, $p_config) = split ("\,", $config);
+
+  my $m = (split ("=", $m_config))[1];
+  my $t = (split ("=", $t_config))[1];
+  my $p = (split ("=", $p_config))[1];
+
+  # Salt and digest are base64 in the hash string; decode both.
+  my $salt_bin   = decode_base64 ($salt_b64);
+  my $digest_bin = decode_base64 ($digest_b64);
+
+  # pack_if_HEX_notation is defined outside this module.
+  my $word_packed = pack_if_HEX_notation ($word);
+
+  # Regenerate using the parsed signature and parameters, the hex-encoded salt and the decoded digest length.
+  my $new_hash = module_generate_hash ($word_packed, unpack ("H*", $salt_bin), $signature, $m, $t, $p, length ($digest_bin));
+
+  return ($new_hash, $word);
+}
+
+1;
diff --git a/tools/test_modules/m70000.pm b/tools/test_modules/m70000.pm
index 4b44334ac..72861ae49 100644
--- a/tools/test_modules/m70000.pm
+++ b/tools/test_modules/m70000.pm
@@ -17,7 +17,7 @@ sub module_generate_hash
 {
   my $word  = shift;
   my $salt  = shift;
-  my $m     = shift // 4096;
+  my $m     = shift // 65536;
   my $t     = shift // 3;
   my $p     = shift // 1;
   my $len   = shift // random_number (1, 2) * 16;