1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-07-31 02:48:50 +00:00

Merge branch 'master' into rule_purgeclass

This commit is contained in:
Gabriele Gristina 2025-07-09 20:01:58 +02:00 committed by GitHub
commit d5c34631ab
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
198 changed files with 15033 additions and 2282 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
.DS_Store
*/.DS_Store
*.exe
*.bin
*.app

View File

@ -19,7 +19,7 @@ cd win-iconv/
cmake -D WIN_ICONV_BUILD_EXECUTABLE=OFF -D CMAKE_INSTALL_PREFIX=/opt/win-iconv-64 -D CMAKE_CXX_COMPILER=$(which x86_64-w64-mingw32-g++) -D CMAKE_C_COMPILER=$(which x86_64-w64-mingw32-gcc) -D CMAKE_SYSTEM_NAME=Windows
sudo make install
cd ../
wget https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-python-3.12.10-1-any.pkg.tar.zst
wget https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-python-3.12.11-1-any.pkg.tar.zst
sudo mkdir /opt/win-python
sudo tar --zstd -xf mingw-w64-x86_64-python-3.12.11-1-any.pkg.tar.zst -C /opt/win-python
```

View File

@ -16,7 +16,7 @@
GLOBAL_AS const bf_t *bfs_buf, \
CONSTANT_AS const u32 &combs_mode, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#else // CUDA, HIP, OpenCL

View File

@ -1946,6 +1946,19 @@ DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c)
#endif
/**
* arithmetic operations
*/
// Returns the high 32 bits of the full 64-bit product x * y.
DECLSPEC u32 hc_umulhi (const u32 x, const u32 y)
{
#if defined IS_CUDA || defined IS_HIP
// native intrinsic on CUDA and HIP
return __umulhi (x, y);
#else
// portable fallback: widen to 64 bit, multiply, keep the upper half
return h32_from_64_S ((u64) x * (u64) y);
#endif
}
/**
* pure scalar functions
*/
@ -41405,7 +41418,6 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u
#endif
#if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
const int offset_mod_4 = offset & 3;
const int offset_minus_4 = 4 - offset_mod_4;
@ -41979,6 +41991,582 @@ DECLSPEC void switch_buffer_by_offset_8x4_le_S (PRIVATE_AS u32 *w0, PRIVATE_AS u
w0[1] = 0;
w0[0] = 0;
break;
case 16:
w7[3] = hc_byte_perm_S (w3[2], w3[3], selector);
w7[2] = hc_byte_perm_S (w3[1], w3[2], selector);
w7[1] = hc_byte_perm_S (w3[0], w3[1], selector);
w7[0] = hc_byte_perm_S (w2[3], w3[0], selector);
w6[3] = hc_byte_perm_S (w2[2], w2[3], selector);
w6[2] = hc_byte_perm_S (w2[1], w2[2], selector);
w6[1] = hc_byte_perm_S (w2[0], w2[1], selector);
w6[0] = hc_byte_perm_S (w1[3], w2[0], selector);
w5[3] = hc_byte_perm_S (w1[2], w1[3], selector);
w5[2] = hc_byte_perm_S (w1[1], w1[2], selector);
w5[1] = hc_byte_perm_S (w1[0], w1[1], selector);
w5[0] = hc_byte_perm_S (w0[3], w1[0], selector);
w4[3] = hc_byte_perm_S (w0[2], w0[3], selector);
w4[2] = hc_byte_perm_S (w0[1], w0[2], selector);
w4[1] = hc_byte_perm_S (w0[0], w0[1], selector);
w4[0] = hc_byte_perm_S ( 0, w0[0], selector);
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 17:
w7[3] = hc_byte_perm_S (w3[1], w3[2], selector);
w7[2] = hc_byte_perm_S (w3[0], w3[1], selector);
w7[1] = hc_byte_perm_S (w2[3], w3[0], selector);
w7[0] = hc_byte_perm_S (w2[2], w2[3], selector);
w6[3] = hc_byte_perm_S (w2[1], w2[2], selector);
w6[2] = hc_byte_perm_S (w2[0], w2[1], selector);
w6[1] = hc_byte_perm_S (w1[3], w2[0], selector);
w6[0] = hc_byte_perm_S (w1[2], w1[3], selector);
w5[3] = hc_byte_perm_S (w1[1], w1[2], selector);
w5[2] = hc_byte_perm_S (w1[0], w1[1], selector);
w5[1] = hc_byte_perm_S (w0[3], w1[0], selector);
w5[0] = hc_byte_perm_S (w0[2], w0[3], selector);
w4[3] = hc_byte_perm_S (w0[1], w0[2], selector);
w4[2] = hc_byte_perm_S (w0[0], w0[1], selector);
w4[1] = hc_byte_perm_S ( 0, w0[0], selector);
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 18:
w7[3] = hc_byte_perm_S (w3[0], w3[1], selector);
w7[2] = hc_byte_perm_S (w2[3], w3[0], selector);
w7[1] = hc_byte_perm_S (w2[2], w2[3], selector);
w7[0] = hc_byte_perm_S (w2[1], w2[2], selector);
w6[3] = hc_byte_perm_S (w2[0], w2[1], selector);
w6[2] = hc_byte_perm_S (w1[3], w2[0], selector);
w6[1] = hc_byte_perm_S (w1[2], w1[3], selector);
w6[0] = hc_byte_perm_S (w1[1], w1[2], selector);
w5[3] = hc_byte_perm_S (w1[0], w1[1], selector);
w5[2] = hc_byte_perm_S (w0[3], w1[0], selector);
w5[1] = hc_byte_perm_S (w0[2], w0[3], selector);
w5[0] = hc_byte_perm_S (w0[1], w0[2], selector);
w4[3] = hc_byte_perm_S (w0[0], w0[1], selector);
w4[2] = hc_byte_perm_S ( 0, w0[0], selector);
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 19:
w7[3] = hc_byte_perm_S (w2[3], w3[0], selector);
w7[2] = hc_byte_perm_S (w2[2], w2[3], selector);
w7[1] = hc_byte_perm_S (w2[1], w2[2], selector);
w7[0] = hc_byte_perm_S (w2[0], w2[1], selector);
w6[3] = hc_byte_perm_S (w1[3], w2[0], selector);
w6[2] = hc_byte_perm_S (w1[2], w1[3], selector);
w6[1] = hc_byte_perm_S (w1[1], w1[2], selector);
w6[0] = hc_byte_perm_S (w1[0], w1[1], selector);
w5[3] = hc_byte_perm_S (w0[3], w1[0], selector);
w5[2] = hc_byte_perm_S (w0[2], w0[3], selector);
w5[1] = hc_byte_perm_S (w0[1], w0[2], selector);
w5[0] = hc_byte_perm_S (w0[0], w0[1], selector);
w4[3] = hc_byte_perm_S ( 0, w0[0], selector);
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 20:
w7[3] = hc_byte_perm_S (w2[2], w2[3], selector);
w7[2] = hc_byte_perm_S (w2[1], w2[2], selector);
w7[1] = hc_byte_perm_S (w2[0], w2[1], selector);
w7[0] = hc_byte_perm_S (w1[3], w2[0], selector);
w6[3] = hc_byte_perm_S (w1[2], w1[3], selector);
w6[2] = hc_byte_perm_S (w1[1], w1[2], selector);
w6[1] = hc_byte_perm_S (w1[0], w1[1], selector);
w6[0] = hc_byte_perm_S (w0[3], w1[0], selector);
w5[3] = hc_byte_perm_S (w0[2], w0[3], selector);
w5[2] = hc_byte_perm_S (w0[1], w0[2], selector);
w5[1] = hc_byte_perm_S (w0[0], w0[1], selector);
w5[0] = hc_byte_perm_S ( 0, w0[0], selector);
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 21:
w7[3] = hc_byte_perm_S (w2[1], w2[2], selector);
w7[2] = hc_byte_perm_S (w2[0], w2[1], selector);
w7[1] = hc_byte_perm_S (w1[3], w2[0], selector);
w7[0] = hc_byte_perm_S (w1[2], w1[3], selector);
w6[3] = hc_byte_perm_S (w1[1], w1[2], selector);
w6[2] = hc_byte_perm_S (w1[0], w1[1], selector);
w6[1] = hc_byte_perm_S (w0[3], w1[0], selector);
w6[0] = hc_byte_perm_S (w0[2], w0[3], selector);
w5[3] = hc_byte_perm_S (w0[1], w0[2], selector);
w5[2] = hc_byte_perm_S (w0[0], w0[1], selector);
w5[1] = hc_byte_perm_S ( 0, w0[0], selector);
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 22:
w7[3] = hc_byte_perm_S (w2[0], w2[1], selector);
w7[2] = hc_byte_perm_S (w1[3], w2[0], selector);
w7[1] = hc_byte_perm_S (w1[2], w1[3], selector);
w7[0] = hc_byte_perm_S (w1[1], w1[2], selector);
w6[3] = hc_byte_perm_S (w1[0], w1[1], selector);
w6[2] = hc_byte_perm_S (w0[3], w1[0], selector);
w6[1] = hc_byte_perm_S (w0[2], w0[3], selector);
w6[0] = hc_byte_perm_S (w0[1], w0[2], selector);
w5[3] = hc_byte_perm_S (w0[0], w0[1], selector);
w5[2] = hc_byte_perm_S ( 0, w0[0], selector);
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 23:
w7[3] = hc_byte_perm_S (w1[3], w2[0], selector);
w7[2] = hc_byte_perm_S (w1[2], w1[3], selector);
w7[1] = hc_byte_perm_S (w1[1], w1[2], selector);
w7[0] = hc_byte_perm_S (w1[0], w1[1], selector);
w6[3] = hc_byte_perm_S (w0[3], w1[0], selector);
w6[2] = hc_byte_perm_S (w0[2], w0[3], selector);
w6[1] = hc_byte_perm_S (w0[1], w0[2], selector);
w6[0] = hc_byte_perm_S (w0[0], w0[1], selector);
w5[3] = hc_byte_perm_S ( 0, w0[0], selector);
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 24:
w7[3] = hc_byte_perm_S (w1[2], w1[3], selector);
w7[2] = hc_byte_perm_S (w1[1], w1[2], selector);
w7[1] = hc_byte_perm_S (w1[0], w1[1], selector);
w7[0] = hc_byte_perm_S (w0[3], w1[0], selector);
w6[3] = hc_byte_perm_S (w0[2], w0[3], selector);
w6[2] = hc_byte_perm_S (w0[1], w0[2], selector);
w6[1] = hc_byte_perm_S (w0[0], w0[1], selector);
w6[0] = hc_byte_perm_S ( 0, w0[0], selector);
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 25:
w7[3] = hc_byte_perm_S (w1[1], w1[2], selector);
w7[2] = hc_byte_perm_S (w1[0], w1[1], selector);
w7[1] = hc_byte_perm_S (w0[3], w1[0], selector);
w7[0] = hc_byte_perm_S (w0[2], w0[3], selector);
w6[3] = hc_byte_perm_S (w0[1], w0[2], selector);
w6[2] = hc_byte_perm_S (w0[0], w0[1], selector);
w6[1] = hc_byte_perm_S ( 0, w0[0], selector);
w6[0] = 0;
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 26:
w7[3] = hc_byte_perm_S (w1[0], w1[1], selector);
w7[2] = hc_byte_perm_S (w0[3], w1[0], selector);
w7[1] = hc_byte_perm_S (w0[2], w0[3], selector);
w7[0] = hc_byte_perm_S (w0[1], w0[2], selector);
w6[3] = hc_byte_perm_S (w0[0], w0[1], selector);
w6[2] = hc_byte_perm_S ( 0, w0[0], selector);
w6[1] = 0;
w6[0] = 0;
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 27:
w7[3] = hc_byte_perm_S (w0[3], w1[0], selector);
w7[2] = hc_byte_perm_S (w0[2], w0[3], selector);
w7[1] = hc_byte_perm_S (w0[1], w0[2], selector);
w7[0] = hc_byte_perm_S (w0[0], w0[1], selector);
w6[3] = hc_byte_perm_S ( 0, w0[0], selector);
w6[2] = 0;
w6[1] = 0;
w6[0] = 0;
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 28:
w7[3] = hc_byte_perm_S (w0[2], w0[3], selector);
w7[2] = hc_byte_perm_S (w0[1], w0[2], selector);
w7[1] = hc_byte_perm_S (w0[0], w0[1], selector);
w7[0] = hc_byte_perm_S ( 0, w0[0], selector);
w6[3] = 0;
w6[2] = 0;
w6[1] = 0;
w6[0] = 0;
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 29:
w7[3] = hc_byte_perm_S (w0[1], w0[2], selector);
w7[2] = hc_byte_perm_S (w0[0], w0[1], selector);
w7[1] = hc_byte_perm_S ( 0, w0[0], selector);
w7[0] = 0;
w6[3] = 0;
w6[2] = 0;
w6[1] = 0;
w6[0] = 0;
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 30:
w7[3] = hc_byte_perm_S (w0[0], w0[1], selector);
w7[2] = hc_byte_perm_S ( 0, w0[0], selector);
w7[1] = 0;
w7[0] = 0;
w6[3] = 0;
w6[2] = 0;
w6[1] = 0;
w6[0] = 0;
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
case 31:
w7[3] = hc_byte_perm_S ( 0, w0[0], selector);
w7[2] = 0;
w7[1] = 0;
w7[0] = 0;
w6[3] = 0;
w6[2] = 0;
w6[1] = 0;
w6[0] = 0;
w5[3] = 0;
w5[2] = 0;
w5[1] = 0;
w5[0] = 0;
w4[3] = 0;
w4[2] = 0;
w4[1] = 0;
w4[0] = 0;
w3[3] = 0;
w3[2] = 0;
w3[1] = 0;
w3[0] = 0;
w2[3] = 0;
w2[2] = 0;
w2[1] = 0;
w2[0] = 0;
w1[3] = 0;
w1[2] = 0;
w1[1] = 0;
w1[0] = 0;
w0[3] = 0;
w0[2] = 0;
w0[1] = 0;
w0[0] = 0;
break;
}
#endif
}

View File

@ -124,10 +124,10 @@
#if defined IS_METAL
#define KERN_ATTR_MAIN_PARAMS \
uint hc_gid [[ thread_position_in_grid ]], \
uint hc_lid [[ thread_position_in_threadgroup ]], \
uint hc_lsz [[ threads_per_threadgroup ]], \
uint hc_bid [[ threadgroup_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]], \
uint3 hc_lid [[ thread_position_in_threadgroup ]], \
uint3 hc_lsz [[ threads_per_threadgroup ]], \
uint3 hc_bid [[ threadgroup_position_in_grid ]]
#endif // IS_METAL
/*
@ -284,6 +284,10 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c);
DECLSPEC u32x hc_lop_0x96 (const u32x a, const u32x b, const u32x c);
DECLSPEC u32 hc_lop_0x96_S (const u32 a, const u32 b, const u32 c);
// arithmetic operations
DECLSPEC u32 hc_umulhi (const u32 x, const u32 y);
// legacy common code
DECLSPEC int ffz (const u32 v);

407
OpenCL/inc_hash_argon2.cl Normal file
View File

@ -0,0 +1,407 @@
/**
* Author......: Netherlands Forensic Institute
* License.....: MIT
*
* Warp code based on original work by Ondrej Mosnáček
*/
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.h"
#include "inc_common.h"
#include "inc_hash_blake2b.h"
#include "inc_hash_argon2.h"
// Generates one of the two seed blocks of a lane:
//   B[lane][blocknum] = H'(H0 || LE32(blocknum) || LE32(lane))
// where H' is Argon2's variable-length hash built from chained Blake2b
// calls, expanded here to the full 1 KiB block size.
// 'in' points at H0 (64 bytes); the result is written to 'blocks' at the
// lane-interleaved position (blocknum * parallelism) + lane.
DECLSPEC void argon2_initial_block (PRIVATE_AS const u32 *in, const u32 lane, const u32 blocknum, const u32 parallelism, GLOBAL_AS argon2_block_t *blocks)
{
blake2b_ctx_t ctx;
blake2b_init (&ctx);
u64 blake_buf[16] = { 0 };
// H' prefix: LE32 output length (1024 = sizeof (argon2_block_t))
blake_buf[0] = sizeof(argon2_block_t);
blake2b_update (&ctx, (PRIVATE_AS u32 *) blake_buf, 4);
// H0 (64 bytes)
blake2b_update (&ctx, in, 64);
// LE32(blocknum) || LE32(lane) — low half of the u64 is stored first
blake_buf[0] = hl32_to_64 (lane, blocknum);
blake2b_update (&ctx, (PRIVATE_AS u32 *) blake_buf, 8);
blake2b_final (&ctx);
GLOBAL_AS u64 *out = blocks[(blocknum * parallelism) + lane].values;
// first 32 bytes of V1 go straight to the output
out[0] = ctx.h[0];
out[1] = ctx.h[1];
out[2] = ctx.h[2];
out[3] = ctx.h[3];
// chain V_{i+1} = Blake2b(V_i); each round contributes its first 32 bytes
for (u32 off = 4; off < 124; off += 4)
{
for (u32 idx = 0; idx < 8; idx++) blake_buf[idx] = ctx.h[idx];
blake2b_init (&ctx);
// hash the previous 64-byte state as a single final message block
blake2b_transform (ctx.h, blake_buf, 64, (u64) BLAKE2B_FINAL);
out[off + 0] = ctx.h[0];
out[off + 1] = ctx.h[1];
out[off + 2] = ctx.h[2];
out[off + 3] = ctx.h[3];
}
// the last round contributes its full second half as well
out[124] = ctx.h[4];
out[125] = ctx.h[5];
out[126] = ctx.h[6];
out[127] = ctx.h[7];
}
// Computes Argon2's initial 64-byte hash H0 into 'blockhash':
//   H0 = Blake2b(p, tagLen, m, t, v, y, |P|, P, |S|, S, |K|, K, |X|, X)
// Secret key K and associated data X are not supported and hashed as
// zero-length fields.
DECLSPEC void argon2_initial_hash (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u64 *blockhash)
{
blake2b_ctx_t ctx;
blake2b_init (&ctx);
// six LE32 parameter words: p, tagLen, m, t, v, y (24 bytes)
u32 option_input[32] = { 0 };
option_input[0] = options->parallelism;
option_input[1] = options->digest_len;
option_input[2] = options->memory_usage_in_kib;
option_input[3] = options->iterations;
option_input[4] = options->version;
option_input[5] = options->type;
blake2b_update (&ctx, option_input, 24);
// |P| then P (password)
u32 len_input[32] = { 0 };
len_input[0] = pw->pw_len;
blake2b_update (&ctx, len_input, 4);
blake2b_update_global (&ctx, pw->i, pw->pw_len);
// |S| then S (salt)
len_input[0] = salt->salt_len;
blake2b_update (&ctx, len_input, 4);
blake2b_update_global (&ctx, salt->salt_buf, salt->salt_len);
len_input[0] = 0;
blake2b_update (&ctx, len_input, 4); // secret (K)
blake2b_update (&ctx, len_input, 4); // associated data (X)
blake2b_final (&ctx);
for (u32 idx = 0; idx < 8; idx++) blockhash[idx] = ctx.h[idx];
}
// Argon2 initialization: derive H0 from password/salt/options, then seed
// the first two blocks of every lane from it.
DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt,
PRIVATE_AS const argon2_options_t *options, GLOBAL_AS argon2_block_t *out)
{
  // H0: initial 64-byte hash (stored in the first 8 u64 slots)
  u64 h0[16] = { 0 };

  argon2_initial_hash (pw, salt, options, h0);

  // Each lane starts with seed blocks 0 and 1 derived from H0
  for (u32 l = 0; l < options->parallelism; l++)
  {
    for (u32 b = 0; b < 2; b++)
    {
      argon2_initial_block ((PRIVATE_AS u32 *) h0, l, b, options->parallelism, out);
    }
  }
}
// TODO: reconsider 'trunc_mul()'
// Argon2's "truncated" multiply: the full 64-bit product of the LOW 32 bits
// of each operand. The upper operand halves are intentionally ignored.
DECLSPEC u64 trunc_mul (u64 x, u64 y)
{
  const u32 a = (u32) x;
  const u32 b = (u32) y;

  const u32 lo = (u32) (a * b);      // low  32 bits of a*b
  const u32 hi = hc_umulhi (a, b);   // high 32 bits of a*b

  return hl32_to_64_S (hi, lo);
}
// Maps the pseudo-random value (J1 || J2) to the flat index of the reference
// block for the block currently being computed (Argon2 indexing function).
DECLSPEC inline u32 argon2_ref_address (PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, u32 index, u64 pseudo_random)
{
u32 ref_lane = 0;
u32 ref_area = 0;
u32 ref_index = 0;
// the first slice of the first pass may only reference its own lane
if ((pos->pass == 0) && (pos->slice == 0))
{
ref_lane = pos->lane;
}
else
{
// J2 (high half) selects the reference lane
ref_lane = h32_from_64_S (pseudo_random) % options->parallelism;
}
// referenceable area: completed slices on pass 0, all other slices later
ref_area = (pos->pass == 0) ? pos->slice : (ARGON2_SYNC_POINTS - 1);
ref_area *= options->segment_length;
// same lane also sees the current segment up to the previous block;
// index == 0 shrinks the area by one in the other-lane case as well
if ((ref_lane == pos->lane) || (index == 0))
{
ref_area += (index - 1);
}
// if ref_area == 0xFFFFFFFF => bug
// J1 (low half) mapped non-uniformly onto the area:
// ref_index = ref_area - 1 - floor(ref_area * J1^2 / 2^64)
const u32 j1 = l32_from_64_S (pseudo_random);
ref_index = (ref_area - 1 - hc_umulhi (ref_area, hc_umulhi (j1, j1)));
if (pos->pass > 0)
{
// later passes: window starts after the current slice, wrapping the lane
ref_index += (pos->slice + 1) * options->segment_length;
if (ref_index >= options->lane_length)
{
ref_index -= options->lane_length;
}
}
// blocks are stored interleaved by lane
return (options->parallelism * ref_index) + ref_lane;
}
// Exchanges two 64-bit values in place.
DECLSPEC void swap_u64 (PRIVATE_AS u64 *x, PRIVATE_AS u64 *y)
{
  const u64 t = *x;

  *x = *y;
  *y = t;
}
// Permutes the four per-thread register values, with the swap pattern
// selected by bits 3 and 2 of the thread id.
DECLSPEC void transpose_permute_block (u64 R[4], int argon2_thread)
{
  const int bit3 = argon2_thread & 0x08;
  const int bit2 = argon2_thread & 0x04;

  if (bit3)
  {
    swap_u64 (&R[0], &R[2]);
    swap_u64 (&R[1], &R[3]);
  }

  if (bit2)
  {
    swap_u64 (&R[0], &R[1]);
    swap_u64 (&R[2], &R[3]);
  }
}
// Computes a source-lane id for the shfl exchanges: bits 1-3 of the thread
// id are kept, while bits 0 and 4 are rotated by a delta derived from idx.
DECLSPEC int argon2_shift (int idx, int argon2_thread)
{
  // delta packs bit 1 of idx into bit 4 and keeps bit 0
  const int delta = ((idx & 0x02) << 3) + (idx & 0x01);

  const int kept    = argon2_thread & 0x0e;
  const int rotated = ((argon2_thread & 0x11) + delta + 0x0e) & 0x11;

  return kept | rotated;
}
// Applies the Argon2 core permutation to one 1 KiB block that is distributed
// over the cooperating threads: each thread holds 4 of the 128 u64 words in
// R. The shfl exchanges regroup the words between ARGON2_G applications.
// NOTE(review): the exact shuffle patterns appear to implement the
// column/diagonal regrouping of the Blake2b-style round — verify against the
// original warp-shuffle Argon2 implementation before changing anything here.
DECLSPEC void argon2_hash_block (u64 R[4], int argon2_thread, LOCAL_AS u64 *shuffle_buf, int argon2_lsz)
{
for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz);
transpose_permute_block (R, argon2_thread);
for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz);
ARGON2_G(R[0], R[1], R[2], R[3]);
for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], (argon2_thread & 0x1c) | ((argon2_thread + idx) & 0x03), argon2_thread, argon2_lsz);
ARGON2_G(R[0], R[1], R[2], R[3]);
// undo the in-row rotation and the transpose before the second round pair
for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], ((argon2_thread & 0x1c) | ((argon2_thread - idx) & 0x03)) ^ (idx << 2), argon2_thread, argon2_lsz);
transpose_permute_block (R, argon2_thread);
for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_thread ^ (idx << 2), argon2_thread, argon2_lsz);
ARGON2_G(R[0], R[1], R[2], R[3]);
for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift (idx, argon2_thread), argon2_thread, argon2_lsz);
ARGON2_G(R[0], R[1], R[2], R[3]);
// inverse shift restores each word to its owning thread
for (u32 idx = 1; idx < 4; idx++) R[idx] = hc__shfl_sync (shuffle_buf, FULL_MASK, R[idx], argon2_shift ((4 - idx), argon2_thread), argon2_thread, argon2_lsz);
}
// Computes the next 128 data-independent reference addresses (Argon2i-style
// addressing) for the segment position in 'pos', starting at 'start_index'.
// The counter block is built cooperatively: each thread seeds one word, the
// block is compressed twice with XOR feedback, and every thread then derives
// 4 of the 128 addresses from its share of the result.
DECLSPEC void argon2_next_addresses (PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, PRIVATE_AS u32 *addresses, u32 start_index, u32 argon2_thread, LOCAL_AS u64 *shuffle_buf, u32 argon2_lsz)
{
u64 Z[4] = { 0 };
u64 tmp[4] = { 0 };
tmp[0] = 0;
tmp[1] = 0;
tmp[2] = 0;
tmp[3] = 0;
// counter block fields, one u64 per thread (threads >= 7 hold zero)
switch (argon2_thread)
{
case 0: Z[0] = pos->pass; break;
case 1: Z[0] = pos->lane; break;
case 2: Z[0] = pos->slice; break;
case 3: Z[0] = options->memory_block_count; break;
case 4: Z[0] = options->iterations; break;
case 5: Z[0] = options->type; break;
case 6: Z[0] = (start_index / 128) + 1; break; // address-block counter
default: Z[0] = 0; break;
}
// first compression with feed-forward of the seeded word
tmp[0] = Z[0];
argon2_hash_block (Z, argon2_thread, shuffle_buf, argon2_lsz);
Z[0] ^= tmp[0];
// second compression, feed-forward over all four words
for (u32 idx = 0; idx < 4; idx++) tmp[idx] = Z[idx];
argon2_hash_block (Z, argon2_thread, shuffle_buf, argon2_lsz);
for (u32 idx = 0; idx < 4; idx++) Z[idx] ^= tmp[idx];
// each thread resolves 4 addresses, strided by THREADS_PER_LANE
for (u32 i = 0, index = (start_index + argon2_thread); i < 4; i++, index += THREADS_PER_LANE)
{
addresses[i] = argon2_ref_address (options, pos, index, Z[i]);
}
// if addresses[0] == 0xFFFFFFFE => bug
}
// Selects one of four elements by runtime index.
// Any index > 3 yields the all-ones sentinel.
DECLSPEC u32 index_u32x4 (const u32 array[4], u32 index)
{
  if (index == 0) return array[0];
  if (index == 1) return array[1];
  if (index == 2) return array[2];
  if (index == 3) return array[3];

  return (u32) -1;
}
// Loads this thread's four words of the PREVIOUS block in the lane into R
// and returns a pointer to the current block. The previous index wraps to
// the end of the lane when the current block is the lane's first block.
DECLSPEC GLOBAL_AS argon2_block_t *argon2_get_current_block (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, u32 lane, u32 index_in_lane, u64 R[4], u32 argon2_thread)
{
  u32 prev_in_lane;

  if (index_in_lane == 0)
  {
    prev_in_lane = options->lane_length - 1; // wrap-around
  }
  else
  {
    prev_in_lane = index_in_lane - 1;
  }

  GLOBAL_AS const argon2_block_t *prev = &blocks[(prev_in_lane * options->parallelism) + lane];

  for (u32 i = 0; i < 4; i++)
  {
    R[i] = prev->values[(i * THREADS_PER_LANE) + argon2_thread];
  }

  return &blocks[(index_in_lane * options->parallelism) + lane];
}
// Fills blocks [start_index, end_index) of the current lane segment.
// R carries the running previous block across iterations; cur_block advances
// by options->parallelism per block because blocks are interleaved by lane.
// indep_addr selects precomputed (data-independent) addresses vs. addresses
// derived on the fly from the previous block (data-dependent, Argon2d-style).
DECLSPEC void argon2_fill_subsegment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, bool indep_addr, const u32 addresses[4],
u32 start_index, u32 end_index, GLOBAL_AS argon2_block_t *cur_block, u64 R[4], u32 argon2_thread, LOCAL_AS u64 *shuffle_buf, u32 argon2_lsz)
{
for (u32 index = start_index; index < end_index; index++, cur_block += options->parallelism)
{
u32 ref_address = 0;
if (indep_addr)
{
// pick the precomputed address and broadcast it from its owning thread
ref_address = index_u32x4 (addresses, (index / THREADS_PER_LANE) % ARGON2_SYNC_POINTS);
ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, index, argon2_thread, argon2_lsz);
}
else
{
// thread 0's R[0] holds (J1 || J2) from the previous block
ref_address = argon2_ref_address (options, pos, index, R[0]);
ref_address = hc__shfl_sync (shuffle_buf, FULL_MASK, ref_address, 0, argon2_thread, argon2_lsz);
}
GLOBAL_AS const argon2_block_t *ref_block = &blocks[ref_address];
u64 tmp[4] = { 0 };
// First pass is overwrite, next passes are XOR with previous
if ((pos->pass > 0) && (options->version != ARGON2_VERSION_10))
{
for (u32 idx = 0; idx < 4; idx++) tmp[idx] = cur_block->values[(idx * THREADS_PER_LANE) + argon2_thread];
}
// R = prev ^ ref; new block = G(R) ^ R (^ old block on later v1.3 passes)
for (u32 idx = 0; idx < 4; idx++) R[idx] ^= ref_block->values[(idx * THREADS_PER_LANE) + argon2_thread];
for (u32 idx = 0; idx < 4; idx++) tmp[idx] ^= R[idx];
argon2_hash_block (R, argon2_thread, shuffle_buf, argon2_lsz);
for (u32 idx = 0; idx < 4; idx++) R[idx] ^= tmp[idx];
for (u32 idx = 0; idx < 4; idx++) cur_block->values[(idx * THREADS_PER_LANE) + argon2_thread] = R[idx];
}
}
// Fills one segment (identified by pass/slice/lane in 'pos').
// Data-independent addressing is used for Argon2i always, and for the first
// two slices of the first pass of Argon2id; otherwise addressing is
// data-dependent (Argon2d-style).
DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf, const u32 argon2_thread, const u32 argon2_lsz)
{
// We have already generated the first two blocks of each lane (for the first pass)
const u32 skip_blocks = (pos->pass == 0) && (pos->slice == 0) ? 2 : 0;
const u32 index_in_lane = (pos->slice * options->segment_length) + skip_blocks;
u64 R[4] = { 0 };
GLOBAL_AS argon2_block_t *cur_block = argon2_get_current_block (blocks, options, pos->lane, index_in_lane, R, argon2_thread);
if ((options->type == TYPE_I) || ((options->type == TYPE_ID) && (pos->pass == 0) && (pos->slice <= 1)))
{
// process in chunks of 128 blocks: one address block covers 128 indices
for (u32 block_index = 0; block_index < options->segment_length; block_index += 128)
{
const u32 start_index = (block_index == 0) ? skip_blocks : block_index;
const u32 end_index = MIN(((start_index | 127) + 1), options->segment_length);
u32 addresses[4] = { 0 };
argon2_next_addresses (options, pos, addresses, block_index, argon2_thread, shuffle_buf, argon2_lsz);
argon2_fill_subsegment (blocks, options, pos, true, addresses, start_index, end_index, cur_block, R, argon2_thread, shuffle_buf, argon2_lsz);
cur_block += (end_index - start_index) * options->parallelism;
}
}
else
{
// data-dependent path: addresses come from the evolving block contents
u32 addresses[4] = { 0 };
argon2_fill_subsegment (blocks, options, pos, false, addresses, skip_blocks, options->segment_length, cur_block, R, argon2_thread, shuffle_buf, argon2_lsz);
}
}
// Produces the final Argon2 tag: XOR of the last block of every lane, then
// hashed down to options->digest_len bytes with the Blake2b-based H'.
// NOTE(review): there is no H' chaining loop here, so this path supports
// digest_len up to 64 bytes only — confirm callers never request more.
DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u32 *out)
{
const u32 lane_length = options->lane_length;
const u32 lanes = options->parallelism;
// C = XOR of the last block in each lane
argon2_block_t final_block = { };
for (u32 l = 0; l < lanes; l++)
{
for (u32 idx = 0; idx < 128; idx++) final_block.values[idx] ^= blocks[((lane_length - 1) * lanes) + l].values[idx];
}
// H' prefix: LE32(digest_len)
u32 output_len[32] = { 0 };
output_len[0] = options->digest_len;
blake2b_ctx_t ctx;
blake2b_init (&ctx);
// Override default (0x40) value in BLAKE2b
ctx.h[0] ^= 0x40 ^ options->digest_len;
blake2b_update (&ctx, output_len, 4);
blake2b_update (&ctx, (PRIVATE_AS u32 *) final_block.values, sizeof(final_block));
blake2b_final (&ctx);
// split each 64-bit state word into two LE 32-bit output words
for (uint i = 0, idx = 0; i < (options->digest_len / 4); i += 2, idx += 1)
{
out [i + 0] = l32_from_64_S (ctx.h[idx]);
out [i + 1] = h32_from_64_S (ctx.h[idx]);
}
}
// Returns the start of the idx-th password's block region inside the shared
// scratch buffer. The cast binds before '+', so the arithmetic advances in
// whole argon2_block_t units: per-password block count times idx.
// ARGON2_TMP_ELEM, when defined at compile time, fixes that per-password
// count; otherwise options->memory_block_count is used.
DECLSPEC GLOBAL_AS argon2_block_t *get_argon2_block (PRIVATE_AS const argon2_options_t *options, GLOBAL_AS void *buf, const int idx)
{
GLOBAL_AS u32 *buf32 = (GLOBAL_AS u32 *) buf;
#ifdef ARGON2_TMP_ELEM
return (GLOBAL_AS argon2_block_t *) buf32 + (ARGON2_TMP_ELEM * idx);
#else
return (GLOBAL_AS argon2_block_t *) buf32 + (options->memory_block_count * idx);
#endif
}

164
OpenCL/inc_hash_argon2.h Normal file
View File

@ -0,0 +1,164 @@
/**
* Author......: Netherlands Forensic Institute
* License.....: MIT
*/
#ifndef INC_HASH_ARGON2_H
#define INC_HASH_ARGON2_H
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define ARGON2_VERSION_10 0x10
#define ARGON2_VERSION_13 0x13
#define THREADS_PER_LANE 32
#define FULL_MASK 0xffffffff
#define BLAKE2B_OUTBYTES 64
#define ARGON2_SYNC_POINTS 4
#define ARGON2_ADDRESSES_IN_BLOCK 128
#define TYPE_D 0
#define TYPE_I 1
#define TYPE_ID 2
#if defined IS_CUDA
#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) __shfl_sync ((mask),(var),(srcLane))
#elif defined IS_HIP
// attention hard coded 32 warps for hip here
#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) __shfl ((var),(srcLane),32)
#elif defined IS_OPENCL
#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) hc__shfl ((shfbuf),(var),(srcLane),(argon2_thread),(argon2_lsz))
#if defined IS_AMD && defined IS_GPU
// AMD GPU variant: 64-bit lane shuffle via two 32-bit ds_bpermute ops.
// ds_bpermute addresses lanes by byte offset, hence src_lane * 4.
DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz)
{
const u32 idx = src_lane << 2;
const u32 l32 = l32_from_64_S (var);
const u32 h32 = h32_from_64_S (var);
const u32 l32r = __builtin_amdgcn_ds_bpermute (idx, l32);
const u32 h32r = __builtin_amdgcn_ds_bpermute (idx, h32);
const u64 out = hl32_to_64_S (h32r, l32r);
return out;
}
#elif defined IS_NV && defined IS_GPU
// NVIDIA GPU variant: 64-bit lane shuffle via two 32-bit shfl.sync.idx ops
// (0x1f is the clamp/segment operand for a full 32-lane warp).
// NOTE(review): the final asm operand (membermask) is 0 rather than
// 0xffffffff — verify against the PTX ISA documentation for shfl.sync.
DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz)
{
const u32 l32 = l32_from_64_S (var);
const u32 h32 = h32_from_64_S (var);
u32 l32r;
u32 h32r;
asm("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0;"
: "=r"(l32r)
: "r"(l32), "r"(src_lane));
asm("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0;"
: "=r"(h32r)
: "r"(h32), "r"(src_lane));
const u64 out = hl32_to_64_S (h32r, l32r);
return out;
}
#else
// Generic OpenCL fallback: exchange values through local memory.
// Both barriers are required: the first publishes every thread's value, the
// second keeps a later call from overwriting the buffer while reads pend.
DECLSPEC u64 hc__shfl (MAYBE_UNUSED LOCAL_AS u64 *shuffle_buf, const u64 var, const int src_lane, const u32 argon2_thread, const u32 argon2_lsz)
{
shuffle_buf[argon2_thread] = var;
barrier (CLK_LOCAL_MEM_FENCE);
// masking assumes argon2_lsz is a power of two — TODO confirm
const u64 out = shuffle_buf[src_lane & (argon2_lsz - 1)];
barrier (CLK_LOCAL_MEM_FENCE);
return out;
}
#endif
#elif defined IS_METAL
#define hc__shfl_sync(shfbuf,mask,var,srcLane,argon2_thread,argon2_lsz) simd_shuffle_64 ((var),(srcLane),(argon2_lsz))
// Metal variant: simd_shuffle operates on 32-bit values, so the 64-bit word
// is split, both halves are shuffled from the same lane, then rejoined.
DECLSPEC u64 simd_shuffle_64 (const u64 var, const int src_lane, const u32 argon2_lsz)
{
  // masking keeps the lane inside the simdgroup
  const u32 lane = src_lane & (argon2_lsz - 1);

  const u32 lo = simd_shuffle (l32_from_64_S (var), lane);
  const u32 hi = simd_shuffle (h32_from_64_S (var), lane);

  return hl32_to_64_S (hi, lo);
}
#endif
#define ARGON2_G(a,b,c,d) \
{ \
a = a + b + 2 * trunc_mul(a, b); \
d = blake2b_rot32_S (d ^ a); \
c = c + d + 2 * trunc_mul(c, d); \
b = blake2b_rot24_S (b ^ c); \
a = a + b + 2 * trunc_mul(a, b); \
d = blake2b_rot16_S (d ^ a); \
c = c + d + 2 * trunc_mul(c, d); \
b = hc_rotr64_S (b ^ c, 63); \
}
#define ARGON2_P() \
{ \
ARGON2_G(v[0], v[4], v[8], v[12]); \
ARGON2_G(v[1], v[5], v[9], v[13]); \
ARGON2_G(v[2], v[6], v[10], v[14]); \
ARGON2_G(v[3], v[7], v[11], v[15]); \
\
ARGON2_G(v[0], v[5], v[10], v[15]); \
ARGON2_G(v[1], v[6], v[11], v[12]); \
ARGON2_G(v[2], v[7], v[8], v[13]); \
ARGON2_G(v[3], v[4], v[9], v[14]); \
}
// One 1 KiB Argon2 memory block: 128 64-bit words.
typedef struct argon2_block
{
u64 values[128];
} argon2_block_t;
// Parsed Argon2 parameters shared by all kernel phases.
typedef struct argon2_options
{
u32 type;                // TYPE_D, TYPE_I or TYPE_ID
u32 version;             // ARGON2_VERSION_10 or ARGON2_VERSION_13
u32 iterations;          // number of passes over memory (t)
u32 parallelism;         // number of lanes (p)
u32 memory_usage_in_kib; // requested memory cost (m)
u32 segment_length;      // blocks per segment
u32 lane_length;         // blocks per lane
u32 memory_block_count;  // total blocks in the memory area
u32 digest_len;          // output tag length in bytes
} argon2_options_t;
// Position within the Argon2 fill schedule: which pass, which of the
// ARGON2_SYNC_POINTS slices, and which lane.
typedef struct argon2_pos
{
u32 pass;
u32 slice;
u32 lane;
} argon2_pos_t;
DECLSPEC void argon2_init (GLOBAL_AS const pw_t *pw, GLOBAL_AS const salt_t *salt, PRIVATE_AS const argon2_options_t *options, GLOBAL_AS argon2_block_t *out);
DECLSPEC void argon2_fill_segment (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS const argon2_pos_t *pos, LOCAL_AS u64 *shuffle_buf, const u32 argon2_thread, const u32 argon2_lsz);
DECLSPEC void argon2_final (GLOBAL_AS argon2_block_t *blocks, PRIVATE_AS const argon2_options_t *options, PRIVATE_AS u32 *out);
DECLSPEC GLOBAL_AS argon2_block_t *get_argon2_block (PRIVATE_AS const argon2_options_t *options, GLOBAL_AS void *buf, const int idx);
#endif // INC_HASH_ARGON2_H

View File

@ -409,7 +409,7 @@ DECLSPEC void blake2b_update (PRIVATE_AS blake2b_ctx_t *ctx, PRIVATE_AS const u3
u32 w6[4];
u32 w7[4];
const int limit = (const int) len - 128; // int type needed, could be negative
const int limit = len - 128; // int type needed, could be negative
int pos1;
int pos4;
@ -499,7 +499,7 @@ DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS co
u32 w6[4];
u32 w7[4];
const int limit = (const int) len - 128; // int type needed, could be negative
const int limit = len - 128; // int type needed, could be negative
int pos1;
int pos4;
@ -580,7 +580,7 @@ DECLSPEC void blake2b_update_global (PRIVATE_AS blake2b_ctx_t *ctx, GLOBAL_AS co
DECLSPEC void blake2b_final (PRIVATE_AS blake2b_ctx_t *ctx)
{
blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_FINAL);
blake2b_transform (ctx->h, ctx->m, ctx->len, (u64) BLAKE2B_FINAL);
}
DECLSPEC void blake2b_transform_vector (PRIVATE_AS u64x *h, PRIVATE_AS const u64x *m, const u32x len, const u64 f0)
@ -813,7 +813,7 @@ DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVA
u32x w6[4];
u32x w7[4];
const int limit = (const int) len - 128; // int type needed, could be negative
const int limit = len - 128; // int type needed, could be negative
int pos1;
int pos4;
@ -894,5 +894,5 @@ DECLSPEC void blake2b_update_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx, PRIVA
DECLSPEC void blake2b_final_vector (PRIVATE_AS blake2b_ctx_vector_t *ctx)
{
blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_FINAL);
blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, (u64) BLAKE2B_FINAL);
}

View File

@ -322,7 +322,7 @@ DECLSPEC void blake2s_update (PRIVATE_AS blake2s_ctx_t *ctx, PRIVATE_AS const u3
u32 w2[4];
u32 w3[4];
const int limit = (const int) len - 64; // int type needed, could be negative
const int limit = len - 64; // int type needed, could be negative
int pos1;
int pos4;
@ -376,7 +376,7 @@ DECLSPEC void blake2s_update_global (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_AS co
u32 w2[4];
u32 w3[4];
const int limit = (const int) len - 64; // int type needed, could be negative
const int limit = len - 64; // int type needed, could be negative
int pos1;
int pos4;
@ -516,7 +516,7 @@ DECLSPEC void blake2s_update_global_swap (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_
u32 w2[4];
u32 w3[4];
const int limit = (const int) len - 64; // int type needed, could be negative
const int limit = len - 64; // int type needed, could be negative
int pos1;
int pos4;
@ -597,13 +597,11 @@ DECLSPEC void blake2s_update_global_swap (PRIVATE_AS blake2s_ctx_t *ctx, GLOBAL_
blake2s_update_64 (ctx, w0, w1, w2, w3, len - (u32) pos1);
}
DECLSPEC void blake2s_final (PRIVATE_AS blake2s_ctx_t *ctx)
{
blake2s_transform (ctx->h, ctx->m, ctx->len, BLAKE2S_FINAL);
blake2s_transform (ctx->h, ctx->m, ctx->len, (u32) BLAKE2S_FINAL);
}
DECLSPEC void blake2s_hmac_init_64 (PRIVATE_AS blake2s_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, PRIVATE_AS const u32 *w2, PRIVATE_AS const u32 *w3)
{
u32 a0[4];
@ -1158,7 +1156,7 @@ DECLSPEC void blake2s_update_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx, PRIVA
u32x w2[4];
u32x w3[4];
const int limit = (const int) len - 64; // int type needed, could be negative
const int limit = len - 64; // int type needed, could be negative
int pos1;
int pos4;
@ -1207,7 +1205,7 @@ DECLSPEC void blake2s_update_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx, PRIVA
DECLSPEC void blake2s_final_vector (PRIVATE_AS blake2s_ctx_vector_t *ctx)
{
blake2s_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2S_FINAL);
blake2s_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, (u32) BLAKE2S_FINAL);
}
DECLSPEC void blake2s_hmac_init_vector_64 (PRIVATE_AS blake2s_hmac_ctx_vector_t *ctx, PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3)

2279
OpenCL/inc_hash_ripemd320.cl Normal file

File diff suppressed because it is too large Load Diff

147
OpenCL/inc_hash_ripemd320.h Normal file
View File

@ -0,0 +1,147 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#ifndef INC_HASH_RIPEMD320_H
#define INC_HASH_RIPEMD320_H
#define RIPEMD320_F(x,y,z) ((x) ^ (y) ^ (z))
#define RIPEMD320_G(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) /* x ? y : z */
#define RIPEMD320_H(x,y,z) (((x) | ~(y)) ^ (z))
#define RIPEMD320_I(x,y,z) ((y) ^ ((z) & ((x) ^ (y)))) /* z ? x : y */
#define RIPEMD320_J(x,y,z) ((x) ^ ((y) | ~(z)))
#ifdef USE_BITSELECT
#define RIPEMD320_Go(x,y,z) (bitselect ((z), (y), (x)))
#define RIPEMD320_Io(x,y,z) (bitselect ((y), (x), (z)))
#else
#define RIPEMD320_Go(x,y,z) (RIPEMD320_G ((x), (y), (z)))
#define RIPEMD320_Io(x,y,z) (RIPEMD320_I ((x), (y), (z)))
#endif
#define RIPEMD320_STEP_S(f,a,b,c,d,e,x,K,s) \
{ \
a += K; \
a += x; \
a += f (b, c, d); \
a = hc_rotl32_S (a, s); \
a += e; \
c = hc_rotl32_S (c, 10u); \
}
#define RIPEMD320_STEP(f,a,b,c,d,e,x,K,s) \
{ \
a += make_u32x (K); \
a += x; \
a += f (b, c, d); \
a = hc_rotl32 (a, s); \
a += e; \
c = hc_rotl32 (c, 10u); \
}
#define ROTATE_LEFT_WORKAROUND_BUG(a,n) ((a << n) | (a >> (32 - n)))
#define RIPEMD320_STEP_S_WORKAROUND_BUG(f,a,b,c,d,e,x,K,s) \
{ \
a += K; \
a += x; \
a += f (b, c, d); \
a = ROTATE_LEFT_WORKAROUND_BUG (a, s); \
a += e; \
c = hc_rotl32_S (c, 10u); \
}
#define RIPEMD320_STEP_WORKAROUND_BUG(f,a,b,c,d,e,x,K,s) \
{ \
a += make_u32x (K); \
a += x; \
a += f (b, c, d); \
a = ROTATE_LEFT_WORKAROUND_BUG (a, s); \
a += e; \
c = hc_rotl32 (c, 10u); \
}
typedef struct ripemd320_ctx
{
u32 h[10];
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int len;
} ripemd320_ctx_t;
typedef struct ripemd320_hmac_ctx
{
ripemd320_ctx_t ipad;
ripemd320_ctx_t opad;
} ripemd320_hmac_ctx_t;
typedef struct ripemd320_ctx_vector
{
u32x h[10];
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
int len;
} ripemd320_ctx_vector_t;
typedef struct ripemd320_hmac_ctx_vector
{
ripemd320_ctx_vector_t ipad;
ripemd320_ctx_vector_t opad;
} ripemd320_hmac_ctx_vector_t;
DECLSPEC void ripemd320_transform (PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, PRIVATE_AS const u32 *w2, PRIVATE_AS const u32 *w3, PRIVATE_AS u32 *digest);
DECLSPEC void ripemd320_init (PRIVATE_AS ripemd320_ctx_t *ctx);
DECLSPEC void ripemd320_update_64 (PRIVATE_AS ripemd320_ctx_t *ctx, PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const int len);
DECLSPEC void ripemd320_update (PRIVATE_AS ripemd320_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_update_swap (PRIVATE_AS ripemd320_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_update_utf16le (PRIVATE_AS ripemd320_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_update_utf16le_swap (PRIVATE_AS ripemd320_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_update_global (PRIVATE_AS ripemd320_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_update_global_swap (PRIVATE_AS ripemd320_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_update_global_utf16le (PRIVATE_AS ripemd320_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_update_global_utf16le_swap (PRIVATE_AS ripemd320_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_final (PRIVATE_AS ripemd320_ctx_t *ctx);
DECLSPEC void ripemd320_hmac_init_64 (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w0, PRIVATE_AS const u32 *w1, PRIVATE_AS const u32 *w2, PRIVATE_AS const u32 *w3);
DECLSPEC void ripemd320_hmac_init (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_init_swap (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_init_global (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_init_global_swap (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_64 (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const int len);
DECLSPEC void ripemd320_hmac_update (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_swap (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_utf16le (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_utf16le_swap (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, PRIVATE_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_global (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_global_swap (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_global_utf16le (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_update_global_utf16le_swap (PRIVATE_AS ripemd320_hmac_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len);
DECLSPEC void ripemd320_hmac_final (PRIVATE_AS ripemd320_hmac_ctx_t *ctx);
DECLSPEC void ripemd320_transform_vector (PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3, PRIVATE_AS u32x *digest);
DECLSPEC void ripemd320_init_vector (PRIVATE_AS ripemd320_ctx_vector_t *ctx);
DECLSPEC void ripemd320_init_vector_from_scalar (PRIVATE_AS ripemd320_ctx_vector_t *ctx, PRIVATE_AS ripemd320_ctx_t *ctx0);
DECLSPEC void ripemd320_update_vector_64 (PRIVATE_AS ripemd320_ctx_vector_t *ctx, PRIVATE_AS u32x *w0, PRIVATE_AS u32x *w1, PRIVATE_AS u32x *w2, PRIVATE_AS u32x *w3, const int len);
DECLSPEC void ripemd320_update_vector (PRIVATE_AS ripemd320_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len);
DECLSPEC void ripemd320_update_vector_swap (PRIVATE_AS ripemd320_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len);
DECLSPEC void ripemd320_update_vector_utf16le (PRIVATE_AS ripemd320_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len);
DECLSPEC void ripemd320_update_vector_utf16le_swap (PRIVATE_AS ripemd320_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len);
DECLSPEC void ripemd320_final_vector (PRIVATE_AS ripemd320_ctx_vector_t *ctx);
DECLSPEC void ripemd320_hmac_init_vector_64 (PRIVATE_AS ripemd320_hmac_ctx_vector_t *ctx, PRIVATE_AS const u32x *w0, PRIVATE_AS const u32x *w1, PRIVATE_AS const u32x *w2, PRIVATE_AS const u32x *w3);
DECLSPEC void ripemd320_hmac_init_vector (PRIVATE_AS ripemd320_hmac_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len);
DECLSPEC void ripemd320_hmac_update_vector_64 (PRIVATE_AS ripemd320_hmac_ctx_vector_t *ctx, PRIVATE_AS u32x *w0, PRIVATE_AS u32x *w1, PRIVATE_AS u32x *w2, PRIVATE_AS u32x *w3, const int len);
DECLSPEC void ripemd320_hmac_update_vector (PRIVATE_AS ripemd320_hmac_ctx_vector_t *ctx, PRIVATE_AS const u32x *w, const int len);
DECLSPEC void ripemd320_hmac_final_vector (PRIVATE_AS ripemd320_hmac_ctx_vector_t *ctx);
#endif // INC_HASH_RIPEMD320_H

View File

@ -19,7 +19,7 @@
CONSTANT_AS const u32 &bits14, \
CONSTANT_AS const u32 &bits15, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#define KERN_ATTR_R_MARKOV \
GLOBAL_AS bf_t *pws_buf_r, \
@ -31,7 +31,7 @@
CONSTANT_AS const u32 &bits14, \
CONSTANT_AS const u32 &bits15, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#define KERN_ATTR_C_MARKOV \
GLOBAL_AS pw_t *pws_buf, \
@ -43,7 +43,7 @@
CONSTANT_AS const u32 &bits14, \
CONSTANT_AS const u32 &bits15, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#else // CUDA, HIP, OpenCL

View File

@ -104,9 +104,19 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
return atomicOr (p, val);
}
DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
DECLSPEC size_t get_group_id (const u32 dimindx)
{
return blockIdx.x;
switch (dimindx)
{
case 0:
return blockIdx.x;
case 1:
return blockIdx.y;
case 2:
return blockIdx.z;
}
return (size_t) -1;
}
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
@ -114,15 +124,34 @@ DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
return (blockIdx.x * blockDim.x) + threadIdx.x;
}
DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused)))
DECLSPEC size_t get_local_id (const u32 dimindx)
{
return threadIdx.x;
switch (dimindx)
{
case 0:
return threadIdx.x;
case 1:
return threadIdx.y;
case 2:
return threadIdx.z;
}
return (size_t) -1;
}
DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused)))
DECLSPEC size_t get_local_size (const u32 dimindx)
{
// verify
return blockDim.x;
switch (dimindx)
{
case 0:
return blockDim.x;
case 1:
return blockDim.y;
case 2:
return blockDim.z;
}
return (size_t) -1;
}
DECLSPEC u32x rotl32 (const u32x a, const int n)
@ -305,9 +334,19 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
return atomicOr (p, val);
}
DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
DECLSPEC size_t get_group_id (const u32 dimindx)
{
return blockIdx.x;
switch (dimindx)
{
case 0:
return blockIdx.x;
case 1:
return blockIdx.y;
case 2:
return blockIdx.z;
}
return (size_t) -1;
}
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
@ -315,15 +354,34 @@ DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
return (blockIdx.x * blockDim.x) + threadIdx.x;
}
DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused)))
DECLSPEC size_t get_local_id (const u32 dimindx)
{
return threadIdx.x;
switch (dimindx)
{
case 0:
return threadIdx.x;
case 1:
return threadIdx.y;
case 2:
return threadIdx.z;
}
return (size_t) -1;
}
DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused)))
DECLSPEC size_t get_local_size (const u32 dimindx)
{
// verify
return blockDim.x;
switch (dimindx)
{
case 0:
return blockDim.x;
case 1:
return blockDim.y;
case 2:
return blockDim.z;
}
return (size_t) -1;
}
DECLSPEC u32x rotl32 (const u32x a, const int n)

View File

@ -27,8 +27,9 @@ DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
DECLSPEC u32 hc_atomic_or (volatile GLOBAL_AS u32 *p, volatile const u32 val);
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)));
DECLSPEC size_t get_local_id (const u32 dimindx __attribute__((unused)));
DECLSPEC size_t get_local_size (const u32 dimindx __attribute__((unused)));
DECLSPEC size_t get_group_id (const u32 dimindx);
DECLSPEC size_t get_local_id (const u32 dimindx);
DECLSPEC size_t get_local_size (const u32 dimindx);
DECLSPEC u32x rotl32 (const u32x a, const int n);
DECLSPEC u32x rotr32 (const u32x a, const int n);
@ -48,7 +49,8 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
DECLSPEC u32 hc_atomic_or (volatile GLOBAL_AS u32 *p, volatile const u32 val);
DECLSPEC size_t get_global_id (const u32 dimindx);
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)));
DECLSPEC size_t get_group_id (const u32 dimindx);
DECLSPEC size_t get_local_id (const u32 dimindx);
DECLSPEC size_t get_local_size (const u32 dimindx);
@ -71,10 +73,25 @@ DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
DECLSPEC u32 hc_atomic_or (volatile GLOBAL_AS u32 *p, volatile const u32 val);
#define get_global_id(param) hc_gid
#define get_local_id(param) hc_lid
#define get_local_size(param) hc_lsz
#define get_group_id(param) hc_bid
#define get_global_id(dimindx) \
((dimindx) == 0 ? hc_gid.x : \
(dimindx) == 1 ? hc_gid.y : \
(dimindx) == 2 ? hc_gid.z : -1)
#define get_group_id(dimindx) \
((dimindx) == 0 ? hc_bid.x : \
(dimindx) == 1 ? hc_bid.y : \
(dimindx) == 2 ? hc_bid.z : -1)
#define get_local_id(dimindx) \
((dimindx) == 0 ? hc_lid.x : \
(dimindx) == 1 ? hc_lid.y : \
(dimindx) == 2 ? hc_lid.z : -1)
#define get_local_size(dimindx) \
((dimindx) == 0 ? hc_lsz.x : \
(dimindx) == 1 ? hc_lsz.y : \
(dimindx) == 2 ? hc_lsz.z : -1)
DECLSPEC u32x rotl32 (const u32x a, const int n);
DECLSPEC u32x rotr32 (const u32x a, const int n);

View File

@ -13,28 +13,28 @@
GLOBAL_AS u32 *pws_comp, \
GLOBAL_AS pw_t *pws_buf, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#define KERN_ATTR_GPU_MEMSET \
GLOBAL_AS uint4 *buf, \
CONSTANT_AS const u32 &value, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#define KERN_ATTR_GPU_BZERO \
GLOBAL_AS uint4 *buf, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#define KERN_ATTR_GPU_ATINIT \
GLOBAL_AS pw_t *buf, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#define KERN_ATTR_GPU_UTF8_TO_UTF16 \
GLOBAL_AS pw_t *pws_buf, \
CONSTANT_AS const u64 &gid_max, \
uint hc_gid [[ thread_position_in_grid ]]
uint3 hc_gid [[ thread_position_in_grid ]]
#else // CUDA, HIP, OpenCL

View File

@ -11,11 +11,13 @@
#define BITMAP_SHIFT1 kernel_param->bitmap_shift1
#define BITMAP_SHIFT2 kernel_param->bitmap_shift2
#define SALT_POS_HOST (kernel_param->pws_pos + gid)
#define SALT_POS_HOST_BID (kernel_param->pws_pos + bid)
#define LOOP_POS kernel_param->loop_pos
#define LOOP_CNT kernel_param->loop_cnt
#define IL_CNT kernel_param->il_cnt
#define DIGESTS_CNT 1
#define DIGESTS_OFFSET_HOST (kernel_param->pws_pos + gid)
#define DIGESTS_OFFSET_HOST (kernel_param->pws_pos + gid)
#define DIGESTS_OFFSET_HOST_BID (kernel_param->pws_pos + bid)
#define COMBS_MODE kernel_param->combs_mode
#define SALT_REPEAT kernel_param->salt_repeat
#define PWS_POS kernel_param->pws_pos
@ -25,11 +27,13 @@
#define BITMAP_SHIFT1 kernel_param->bitmap_shift1
#define BITMAP_SHIFT2 kernel_param->bitmap_shift2
#define SALT_POS_HOST kernel_param->salt_pos_host
#define SALT_POS_HOST_BID SALT_POS_HOST
#define LOOP_POS kernel_param->loop_pos
#define LOOP_CNT kernel_param->loop_cnt
#define IL_CNT kernel_param->il_cnt
#define DIGESTS_CNT kernel_param->digests_cnt
#define DIGESTS_OFFSET_HOST kernel_param->digests_offset_host
#define DIGESTS_OFFSET_HOST_BID DIGESTS_OFFSET_HOST
#define COMBS_MODE kernel_param->combs_mode
#define SALT_REPEAT kernel_param->salt_repeat
#define PWS_POS kernel_param->pws_pos
@ -1565,6 +1569,202 @@ typedef enum ripemd160_constants
} ripemd160_constants_t;
typedef enum ripemd320_constants
{
RIPEMD320M_A=0x67452301U,
RIPEMD320M_B=0xefcdab89U,
RIPEMD320M_C=0x98badcfeU,
RIPEMD320M_D=0x10325476U,
RIPEMD320M_E=0xc3d2e1f0U,
RIPEMD320M_F=0x76543210U,
RIPEMD320M_G=0xfedcba98U,
RIPEMD320M_H=0x89abcdefU,
RIPEMD320M_I=0x01234567U,
RIPEMD320M_L=0x3c2d1e0fU,
RIPEMD320C00=0x00000000U,
RIPEMD320C10=0x5a827999U,
RIPEMD320C20=0x6ed9eba1U,
RIPEMD320C30=0x8f1bbcdcU,
RIPEMD320C40=0xa953fd4eU,
RIPEMD320C50=0x50a28be6U,
RIPEMD320C60=0x5c4dd124U,
RIPEMD320C70=0x6d703ef3U,
RIPEMD320C80=0x7a6d76e9U,
RIPEMD320C90=0x00000000U,
RIPEMD320S00=11,
RIPEMD320S01=14,
RIPEMD320S02=15,
RIPEMD320S03=12,
RIPEMD320S04=5,
RIPEMD320S05=8,
RIPEMD320S06=7,
RIPEMD320S07=9,
RIPEMD320S08=11,
RIPEMD320S09=13,
RIPEMD320S0A=14,
RIPEMD320S0B=15,
RIPEMD320S0C=6,
RIPEMD320S0D=7,
RIPEMD320S0E=9,
RIPEMD320S0F=8,
RIPEMD320S10=7,
RIPEMD320S11=6,
RIPEMD320S12=8,
RIPEMD320S13=13,
RIPEMD320S14=11,
RIPEMD320S15=9,
RIPEMD320S16=7,
RIPEMD320S17=15,
RIPEMD320S18=7,
RIPEMD320S19=12,
RIPEMD320S1A=15,
RIPEMD320S1B=9,
RIPEMD320S1C=11,
RIPEMD320S1D=7,
RIPEMD320S1E=13,
RIPEMD320S1F=12,
RIPEMD320S20=11,
RIPEMD320S21=13,
RIPEMD320S22=6,
RIPEMD320S23=7,
RIPEMD320S24=14,
RIPEMD320S25=9,
RIPEMD320S26=13,
RIPEMD320S27=15,
RIPEMD320S28=14,
RIPEMD320S29=8,
RIPEMD320S2A=13,
RIPEMD320S2B=6,
RIPEMD320S2C=5,
RIPEMD320S2D=12,
RIPEMD320S2E=7,
RIPEMD320S2F=5,
RIPEMD320S30=11,
RIPEMD320S31=12,
RIPEMD320S32=14,
RIPEMD320S33=15,
RIPEMD320S34=14,
RIPEMD320S35=15,
RIPEMD320S36=9,
RIPEMD320S37=8,
RIPEMD320S38=9,
RIPEMD320S39=14,
RIPEMD320S3A=5,
RIPEMD320S3B=6,
RIPEMD320S3C=8,
RIPEMD320S3D=6,
RIPEMD320S3E=5,
RIPEMD320S3F=12,
RIPEMD320S40=9,
RIPEMD320S41=15,
RIPEMD320S42=5,
RIPEMD320S43=11,
RIPEMD320S44=6,
RIPEMD320S45=8,
RIPEMD320S46=13,
RIPEMD320S47=12,
RIPEMD320S48=5,
RIPEMD320S49=12,
RIPEMD320S4A=13,
RIPEMD320S4B=14,
RIPEMD320S4C=11,
RIPEMD320S4D=8,
RIPEMD320S4E=5,
RIPEMD320S4F=6,
RIPEMD320S50=8,
RIPEMD320S51=9,
RIPEMD320S52=9,
RIPEMD320S53=11,
RIPEMD320S54=13,
RIPEMD320S55=15,
RIPEMD320S56=15,
RIPEMD320S57=5,
RIPEMD320S58=7,
RIPEMD320S59=7,
RIPEMD320S5A=8,
RIPEMD320S5B=11,
RIPEMD320S5C=14,
RIPEMD320S5D=14,
RIPEMD320S5E=12,
RIPEMD320S5F=6,
RIPEMD320S60=9,
RIPEMD320S61=13,
RIPEMD320S62=15,
RIPEMD320S63=7,
RIPEMD320S64=12,
RIPEMD320S65=8,
RIPEMD320S66=9,
RIPEMD320S67=11,
RIPEMD320S68=7,
RIPEMD320S69=7,
RIPEMD320S6A=12,
RIPEMD320S6B=7,
RIPEMD320S6C=6,
RIPEMD320S6D=15,
RIPEMD320S6E=13,
RIPEMD320S6F=11,
RIPEMD320S70=9,
RIPEMD320S71=7,
RIPEMD320S72=15,
RIPEMD320S73=11,
RIPEMD320S74=8,
RIPEMD320S75=6,
RIPEMD320S76=6,
RIPEMD320S77=14,
RIPEMD320S78=12,
RIPEMD320S79=13,
RIPEMD320S7A=5,
RIPEMD320S7B=14,
RIPEMD320S7C=13,
RIPEMD320S7D=13,
RIPEMD320S7E=7,
RIPEMD320S7F=5,
RIPEMD320S80=15,
RIPEMD320S81=5,
RIPEMD320S82=8,
RIPEMD320S83=11,
RIPEMD320S84=14,
RIPEMD320S85=14,
RIPEMD320S86=6,
RIPEMD320S87=14,
RIPEMD320S88=6,
RIPEMD320S89=9,
RIPEMD320S8A=12,
RIPEMD320S8B=9,
RIPEMD320S8C=12,
RIPEMD320S8D=5,
RIPEMD320S8E=15,
RIPEMD320S8F=8,
RIPEMD320S90=8,
RIPEMD320S91=5,
RIPEMD320S92=12,
RIPEMD320S93=9,
RIPEMD320S94=12,
RIPEMD320S95=5,
RIPEMD320S96=14,
RIPEMD320S97=6,
RIPEMD320S98=8,
RIPEMD320S99=13,
RIPEMD320S9A=6,
RIPEMD320S9B=5,
RIPEMD320S9C=15,
RIPEMD320S9D=13,
RIPEMD320S9E=11,
RIPEMD320S9F=11
} ripemd320_constants_t;
typedef enum keccak_constants
{
KECCAK_RNDC_00=0x0000000000000001UL,
@ -1812,6 +2012,7 @@ typedef struct salt
u32 salt_len_pc;
u32 salt_iter;
u32 salt_iter2;
u32 salt_dimy;
u32 salt_sign[2];
u32 salt_repeats;

View File

@ -183,13 +183,11 @@ using namespace metal;
#ifdef IS_CUDA
#define USE_BITSELECT
#define USE_ROTATE
#define USE_FUNNELSHIFT
#endif
#ifdef IS_HIP
#define USE_BITSELECT
#define USE_ROTATE
#define USE_FUNNELSHIFT
#endif
#ifdef IS_ROCM
@ -220,7 +218,12 @@ using namespace metal;
#define s3 w
#endif
#if HAS_SHFW == 1
#define USE_FUNNELSHIFT
#endif
// some algorithms do not like this, eg 150, 1100, ...
#ifdef NO_FUNNELSHIFT
#undef USE_FUNNELSHIFT
#endif

View File

@ -238,12 +238,162 @@ DECLSPEC void chacha20_transform (PRIVATE_AS const u32x *w0, PRIVATE_AS const u3
KERNEL_FQ KERNEL_FA void m15400_m04 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
{
// fixed size 32
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
if (gid >= GID_CNT) return;
u32 w0[4];
u32 w1[4];
w0[0] = pws[gid].i[0];
w0[1] = pws[gid].i[1];
w0[2] = pws[gid].i[2];
w0[3] = pws[gid].i[3];
w1[0] = pws[gid].i[4];
w1[1] = pws[gid].i[5];
w1[2] = pws[gid].i[6];
w1[3] = pws[gid].i[7];
/**
* Salt prep
*/
u32 iv[2];
iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
u32 plain[2];
plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
u32 position[2];
position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
/**
* loop
*/
u32 w0l = pws[gid].i[0];
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0x = w0l | w0r;
u32x w0_t[4];
u32x w1_t[4];
w0_t[0] = w0x;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
u32x digest[4] = { 0 };
chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
const u32x r0 = digest[0];
const u32x r1 = digest[1];
const u32x r2 = digest[2];
const u32x r3 = digest[3];
COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
KERNEL_FQ KERNEL_FA void m15400_m08 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
{
// fixed size 32
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
if (gid >= GID_CNT) return;
u32 w0[4];
u32 w1[4];
w0[0] = pws[gid].i[0];
w0[1] = pws[gid].i[1];
w0[2] = pws[gid].i[2];
w0[3] = pws[gid].i[3];
w1[0] = pws[gid].i[4];
w1[1] = pws[gid].i[5];
w1[2] = pws[gid].i[6];
w1[3] = pws[gid].i[7];
/**
* Salt prep
*/
u32 iv[2];
iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
u32 plain[2];
plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
u32 position[2];
position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
/**
* loop
*/
u32 w0l = pws[gid].i[0];
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0x = w0l | w0r;
u32x w0_t[4];
u32x w1_t[4];
w0_t[0] = w0x;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
u32x digest[4] = { 0 };
chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
const u32x r0 = digest[0];
const u32x r1 = digest[1];
const u32x r2 = digest[2];
const u32x r3 = digest[3];
COMPARE_M_SIMD (r0, r1, r2, r3);
}
}
KERNEL_FQ KERNEL_FA void m15400_m16 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
@ -328,12 +478,187 @@ KERNEL_FQ KERNEL_FA void m15400_m16 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
KERNEL_FQ KERNEL_FA void m15400_s04 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
{
// fixed size 32
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
if (gid >= GID_CNT) return;
u32 w0[4];
u32 w1[4];
w0[0] = pws[gid].i[0];
w0[1] = pws[gid].i[1];
w0[2] = pws[gid].i[2];
w0[3] = pws[gid].i[3];
w1[0] = pws[gid].i[4];
w1[1] = pws[gid].i[5];
w1[2] = pws[gid].i[6];
w1[3] = pws[gid].i[7];
/**
* Salt prep
*/
u32 iv[2];
iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
u32 plain[2];
plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
u32 position[2];
position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
};
/**
* loop
*/
u32 w0l = pws[gid].i[0];
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0x = w0l | w0r;
u32x w0_t[4];
u32x w1_t[4];
w0_t[0] = w0x;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
u32x digest[4] = { 0 };
chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
const u32x r0 = digest[0];
const u32x r1 = digest[1];
const u32x r2 = digest[2];
const u32x r3 = digest[3];
COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
KERNEL_FQ KERNEL_FA void m15400_s08 (KERN_ATTR_VECTOR_ESALT (chacha20_t))
{
// fixed size 32
/**
* modifier
*/
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);
if (gid >= GID_CNT) return;
u32 w0[4];
u32 w1[4];
w0[0] = pws[gid].i[0];
w0[1] = pws[gid].i[1];
w0[2] = pws[gid].i[2];
w0[3] = pws[gid].i[3];
w1[0] = pws[gid].i[4];
w1[1] = pws[gid].i[5];
w1[2] = pws[gid].i[6];
w1[3] = pws[gid].i[7];
/**
* Salt prep
*/
u32 iv[2];
iv[0] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[0];
iv[1] = esalt_bufs[DIGESTS_OFFSET_HOST].iv[1];
u32 plain[2];
plain[0] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[0];
plain[1] = esalt_bufs[DIGESTS_OFFSET_HOST].plain[1];
u32 position[2];
position[0] = esalt_bufs[DIGESTS_OFFSET_HOST].position[0];
position[1] = esalt_bufs[DIGESTS_OFFSET_HOST].position[1];
u32 offset = esalt_bufs[DIGESTS_OFFSET_HOST].offset;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
};
/**
* loop
*/
u32 w0l = pws[gid].i[0];
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0x = w0l | w0r;
u32x w0_t[4];
u32x w1_t[4];
w0_t[0] = w0x;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
w1_t[0] = w1[0];
w1_t[1] = w1[1];
w1_t[2] = w1[2];
w1_t[3] = w1[3];
u32x digest[4] = { 0 };
chacha20_transform (w0_t, w1_t, position, offset, iv, plain, digest);
const u32x r0 = digest[0];
const u32x r1 = digest[1];
const u32x r2 = digest[2];
const u32x r3 = digest[3];
COMPARE_S_SIMD (r0, r1, r2, r3);
}
}
KERNEL_FQ KERNEL_FA void m15400_s16 (KERN_ATTR_VECTOR_ESALT (chacha20_t))

View File

@ -17,6 +17,47 @@
#include M2S(INCLUDE_PATH/inc_hash_sha1.cl)
#endif
DECLSPEC void _totp_calculate (PRIVATE_AS u32 *code, PRIVATE_AS const u32 *w, const u32 pw_len, PRIVATE_AS const u32 *s, const u32 salt_len)
{
sha1_hmac_ctx_t ctx;
sha1_hmac_init_swap (&ctx, w, pw_len);
sha1_hmac_update (&ctx, s, salt_len);
sha1_hmac_final (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
}
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
*code = otp_code % 1000000;
}
KERNEL_FQ KERNEL_FA void m18100_mxx (KERN_ATTR_RULES ())
{
/**
@ -34,63 +75,85 @@ KERNEL_FQ KERNEL_FA void m18100_mxx (KERN_ATTR_RULES ())
COPY_PW (pws[gid]);
const u32 salt_len = 8;
const u32 count = salt_bufs[SALT_POS_HOST].salt_len / 16;
u32 s[64] = { 0 };
for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
for (u32 i = 0; i < count; i += 1)
{
s[idx] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[idx]);
s[16 * i + 0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 0]);
s[16 * i + 1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 1]);
}
/**
* loop
*/
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
if (count == 1)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
sha1_hmac_ctx_t ctx;
sha1_hmac_init_swap (&ctx, tmp.i, tmp.pw_len);
sha1_hmac_update (&ctx, s, salt_len);
sha1_hmac_final (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
u32 otp_code0;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s, 8);
COMPARE_M_SCALAR (otp_code0, 0, 0, 0);
}
}
else if (count == 2)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
pw_t tmp = PASTE_PW;
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
otp_code %= 1000000;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
COMPARE_M_SCALAR (otp_code, 0, 0, 0);
u32 otp_code0, otp_code1;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s + 0, 8);
_totp_calculate (&otp_code1, tmp.i, tmp.pw_len, s + 16, 8);
COMPARE_M_SCALAR (otp_code0, otp_code1, 0, 0);
}
}
else if (count == 3)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
u32 otp_code0, otp_code1, otp_code2;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s + 0, 8);
_totp_calculate (&otp_code1, tmp.i, tmp.pw_len, s + 16, 8);
_totp_calculate (&otp_code2, tmp.i, tmp.pw_len, s + 32, 8);
COMPARE_M_SCALAR (otp_code0, otp_code1, otp_code2, 0);
}
}
else if (count == 4)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
u32 otp_code0, otp_code1, otp_code2, otp_code3;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s + 0, 8);
_totp_calculate (&otp_code1, tmp.i, tmp.pw_len, s + 16, 8);
_totp_calculate (&otp_code2, tmp.i, tmp.pw_len, s + 32, 8);
_totp_calculate (&otp_code3, tmp.i, tmp.pw_len, s + 48, 8);
COMPARE_M_SCALAR (otp_code0, otp_code1, otp_code2, otp_code3);
}
}
}
@ -123,62 +186,108 @@ KERNEL_FQ KERNEL_FA void m18100_sxx (KERN_ATTR_RULES ())
COPY_PW (pws[gid]);
const u32 salt_len = 8;
const u32 count = salt_bufs[SALT_POS_HOST].salt_len / 16;
u32 s[64] = { 0 };
for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
for (u32 i = 0; i < count; i += 1)
{
s[idx] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[idx]);
s[16 * i + 0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 0]);
s[16 * i + 1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 1]);
}
/**
* loop
*/
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
if (count == 1)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
sha1_hmac_ctx_t ctx;
sha1_hmac_init_swap (&ctx, tmp.i, tmp.pw_len);
sha1_hmac_update (&ctx, s, salt_len);
sha1_hmac_final (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
u32 otp_code0;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s, 8);
COMPARE_S_SCALAR (otp_code0, 0, 0, 0);
}
}
else if (count == 2)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
pw_t tmp = PASTE_PW;
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
otp_code %= 1000000;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
COMPARE_S_SCALAR (otp_code, 0, 0, 0);
u32 otp_code0, otp_code1;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s, 8);
if (otp_code0 == search[0])
{
_totp_calculate (&otp_code1, tmp.i, tmp.pw_len, s + 16, 8);
COMPARE_S_SCALAR (otp_code0, otp_code1, 0, 0);
}
}
}
else if (count == 3)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
u32 otp_code0, otp_code1, otp_code2;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s, 8);
if (otp_code0 == search[0])
{
_totp_calculate (&otp_code1, tmp.i, tmp.pw_len, s + 16, 8);
if (otp_code1 == search[1])
{
_totp_calculate (&otp_code2, tmp.i, tmp.pw_len, s + 32, 8);
COMPARE_S_SCALAR (otp_code0, otp_code1, otp_code2, 0);
}
}
}
}
else if (count == 4)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
pw_t tmp = PASTE_PW;
tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
u32 otp_code0, otp_code1, otp_code2, otp_code3;
_totp_calculate (&otp_code0, tmp.i, tmp.pw_len, s, 8);
if (otp_code0 == search[0])
{
_totp_calculate (&otp_code1, tmp.i, tmp.pw_len, s + 16, 8);
if (otp_code1 == search[1])
{
_totp_calculate (&otp_code2, tmp.i, tmp.pw_len, s + 32, 8);
if (otp_code2 == search[2])
{
_totp_calculate (&otp_code3, tmp.i, tmp.pw_len, s + 48, 8);
COMPARE_S_SCALAR (otp_code0, otp_code1, otp_code2, otp_code3);
}
}
}
}
}
}

View File

@ -14,6 +14,47 @@
#include M2S(INCLUDE_PATH/inc_hash_sha1.cl)
#endif
DECLSPEC void _totp_calculate (PRIVATE_AS u32 *code, PRIVATE_AS const u32 *w, const u32 pw_len, PRIVATE_AS const u32 *s, const u32 salt_len)
{
sha1_hmac_ctx_t ctx;
sha1_hmac_init (&ctx, w, pw_len);
sha1_hmac_update (&ctx, s, salt_len);
sha1_hmac_final (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
}
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
*code = otp_code % 1000000;
}
KERNEL_FQ KERNEL_FA void m18100_mxx (KERN_ATTR_BASIC ())
{
/**
@ -38,81 +79,157 @@ KERNEL_FQ KERNEL_FA void m18100_mxx (KERN_ATTR_BASIC ())
w[idx] = hc_swap32_S (pws[gid].i[idx]);
}
const u32 salt_len = 8;
const u32 count = salt_bufs[SALT_POS_HOST].salt_len / 16;
u32 s[64] = { 0 };
for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
for (u32 i = 0; i < count; i += 1)
{
s[idx] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[idx]);
s[16 * i + 0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 0]);
s[16 * i + 1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 1]);
}
/**
* loop
*/
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
if (count == 1)
{
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s, 8);
COMPARE_M_SCALAR (otp_code0, 0, 0, 0);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
}
else if (count == 2)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
c[i] |= w[i];
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0, otp_code1;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s + 0, 8);
_totp_calculate (&otp_code1, c, pw_len + comb_len, s + 16, 8);
COMPARE_M_SCALAR (otp_code0, otp_code1, 0, 0);
}
sha1_hmac_ctx_t ctx;
sha1_hmac_init (&ctx, c, pw_len + comb_len);
sha1_hmac_update (&ctx, s, salt_len);
sha1_hmac_final (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
}
else if (count == 3)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0, otp_code1, otp_code2;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s + 0, 8);
_totp_calculate (&otp_code1, c, pw_len + comb_len, s + 16, 8);
_totp_calculate (&otp_code2, c, pw_len + comb_len, s + 32, 8);
COMPARE_M_SCALAR (otp_code0, otp_code1, otp_code2, 0);
}
}
else if (count == 4)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
const u32 comb_len = combs_buf[il_pos].pw_len;
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
otp_code %= 1000000;
u32 c[64];
COMPARE_M_SCALAR (otp_code, 0, 0, 0);
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0, otp_code1, otp_code2, otp_code3;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s + 0, 8);
_totp_calculate (&otp_code1, c, pw_len + comb_len, s + 16, 8);
_totp_calculate (&otp_code2, c, pw_len + comb_len, s + 32, 8);
_totp_calculate (&otp_code3, c, pw_len + comb_len, s + 48, 8);
COMPARE_M_SCALAR (otp_code0, otp_code1, otp_code2, otp_code3);
}
}
}
@ -152,80 +269,180 @@ KERNEL_FQ KERNEL_FA void m18100_sxx (KERN_ATTR_BASIC ())
w[idx] = hc_swap32_S (pws[gid].i[idx]);
}
const u32 salt_len = 8;
const u32 count = salt_bufs[SALT_POS_HOST].salt_len / 16;
u32 s[64] = { 0 };
for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
for (u32 i = 0; i < count; i += 1)
{
s[idx] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[idx]);
s[16 * i + 0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 0]);
s[16 * i + 1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 1]);
}
/**
* loop
*/
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
if (count == 1)
{
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s, 8);
COMPARE_S_SCALAR (otp_code0, 0, 0, 0);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
}
else if (count == 2)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
c[i] |= w[i];
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0, otp_code1;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s, 8);
if (otp_code0 == search[0])
{
_totp_calculate (&otp_code1, c, pw_len + comb_len, s + 16, 8);
COMPARE_S_SCALAR (otp_code0, otp_code1, 0, 0);
}
}
sha1_hmac_ctx_t ctx;
sha1_hmac_init (&ctx, c, pw_len + comb_len);
sha1_hmac_update (&ctx, s, salt_len);
sha1_hmac_final (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
}
else if (count == 3)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
const u32 comb_len = combs_buf[il_pos].pw_len;
u32 c[64];
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0, otp_code1, otp_code2;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s, 8);
if (otp_code0 == search[0])
{
_totp_calculate (&otp_code1, c, pw_len + comb_len, s + 16, 8);
if (otp_code1 == search[1])
{
_totp_calculate (&otp_code2, c, pw_len + comb_len, s + 32, 8);
COMPARE_S_SCALAR (otp_code0, otp_code1, otp_code2, 0);
}
}
}
}
else if (count == 4)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
{
const u32 comb_len = combs_buf[il_pos].pw_len;
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
otp_code %= 1000000;
u32 c[64];
COMPARE_S_SCALAR (otp_code, 0, 0, 0);
#ifdef _unroll
#pragma unroll
#endif
for (int idx = 0; idx < 64; idx++)
{
c[idx] = hc_swap32_S (combs_buf[il_pos].i[idx]);
}
switch_buffer_by_offset_1x64_be_S (c, pw_len);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0; i < 64; i++)
{
c[i] |= w[i];
}
u32 otp_code0, otp_code1, otp_code2, otp_code3;
_totp_calculate (&otp_code0, c, pw_len + comb_len, s, 8);
if (otp_code0 == search[0])
{
_totp_calculate (&otp_code1, c, pw_len + comb_len, s + 16, 8);
if (otp_code1 == search[1])
{
_totp_calculate (&otp_code2, c, pw_len + comb_len, s + 32, 8);
if (otp_code2 == search[2])
{
_totp_calculate (&otp_code3, c, pw_len + comb_len, s + 48, 8);
COMPARE_S_SCALAR (otp_code0, otp_code1, otp_code2, otp_code3);
}
}
}
}
}
}

View File

@ -1,205 +1,324 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_sha1.cl)
#endif
KERNEL_FQ KERNEL_FA void m18100_mxx (KERN_ATTR_VECTOR ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= GID_CNT) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32x w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
const u32 salt_len = 8;
u32x s[64] = { 0 };
for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
{
s[idx] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[idx]);
}
/**
* loop
*/
u32x w0l = w[0];
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
sha1_hmac_ctx_vector_t ctx;
sha1_hmac_init_vector (&ctx, w, pw_len);
sha1_hmac_update_vector (&ctx, s, salt_len);
sha1_hmac_final_vector (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
}
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
otp_code %= 1000000;
COMPARE_M_SIMD (otp_code, 0, 0, 0);
}
}
KERNEL_FQ KERNEL_FA void m18100_sxx (KERN_ATTR_VECTOR ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= GID_CNT) return;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
};
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32x w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
const u32 salt_len = 8;
u32x s[64] = { 0 };
for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
{
s[idx] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[idx]);
}
/**
* loop
*/
u32x w0l = w[0];
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
sha1_hmac_ctx_vector_t ctx;
sha1_hmac_init_vector (&ctx, w, pw_len);
sha1_hmac_update_vector (&ctx, s, salt_len);
sha1_hmac_final_vector (&ctx);
// initialize a buffer for the otp code
u32 otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
}
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
otp_code %= 1000000;
COMPARE_S_SIMD (otp_code, 0, 0, 0);
}
}
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_sha1.cl)
#endif
DECLSPEC void _totp_calculate (PRIVATE_AS u32x *code, PRIVATE_AS const u32x *w, const u32 pw_len, PRIVATE_AS const u32x *s, const u32 salt_len)
{
sha1_hmac_ctx_vector_t ctx;
sha1_hmac_init_vector (&ctx, w, pw_len);
sha1_hmac_update_vector (&ctx, s, salt_len);
sha1_hmac_final_vector (&ctx);
// initialize a buffer for the otp code
u32x otp_code = 0;
// grab 4 consecutive bytes of the hash, starting at offset
switch (ctx.opad.h[4] & 15)
{
case 0: otp_code = ctx.opad.h[0]; break;
case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break;
case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break;
case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break;
case 4: otp_code = ctx.opad.h[1]; break;
case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break;
case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break;
case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break;
case 8: otp_code = ctx.opad.h[2]; break;
case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break;
case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break;
case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break;
case 12: otp_code = ctx.opad.h[3]; break;
case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break;
case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break;
case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break;
}
// take only the lower 31 bits
otp_code &= 0x7fffffff;
// we want to generate only 6 digits of code
*code = otp_code % 1000000;
}
KERNEL_FQ KERNEL_FA void m18100_mxx (KERN_ATTR_VECTOR ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= GID_CNT) return;
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32x w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
const u32 count = salt_bufs[SALT_POS_HOST].salt_len / 16;
u32x s[64] = { 0 };
for (u32 i = 0; i < count; i += 1)
{
s[16 * i + 0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 0]);
s[16 * i + 1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 1]);
}
/**
* loop
*/
u32x w0l = w[0];
if (count == 1)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0;
_totp_calculate (&otp_code0, w, pw_len, s, 8);
COMPARE_M_SIMD (otp_code0, 0, 0, 0);
}
}
else if (count == 2)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0, otp_code1;
_totp_calculate (&otp_code0, w, pw_len, s + 0, 8);
_totp_calculate (&otp_code1, w, pw_len, s + 16, 8);
COMPARE_M_SIMD (otp_code0, otp_code1, 0, 0);
}
}
else if (count == 3)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0, otp_code1, otp_code2;
_totp_calculate (&otp_code0, w, pw_len, s + 0, 8);
_totp_calculate (&otp_code1, w, pw_len, s + 16, 8);
_totp_calculate (&otp_code2, w, pw_len, s + 32, 8);
COMPARE_M_SIMD (otp_code0, otp_code1, otp_code2, 0);
}
}
else if (count == 4)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0, otp_code1, otp_code2, otp_code3;
_totp_calculate (&otp_code0, w, pw_len, s + 0, 8);
_totp_calculate (&otp_code1, w, pw_len, s + 16, 8);
_totp_calculate (&otp_code2, w, pw_len, s + 32, 8);
_totp_calculate (&otp_code3, w, pw_len, s + 48, 8);
COMPARE_M_SIMD (otp_code0, otp_code1, otp_code2, otp_code3);
}
}
}
KERNEL_FQ KERNEL_FA void m18100_sxx (KERN_ATTR_VECTOR ())
{
/**
* modifier
*/
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
if (gid >= GID_CNT) return;
/**
* digest
*/
const u32 search[4] =
{
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
};
/**
* base
*/
const u32 pw_len = pws[gid].pw_len;
u32x w[64] = { 0 };
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
const u32 count = salt_bufs[SALT_POS_HOST].salt_len / 16;
u32x s[64] = { 0 };
for (u32 i = 0; i < count; i += 1)
{
s[16 * i + 0] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 0]);
s[16 * i + 1] = hc_swap32_S (salt_bufs[SALT_POS_HOST].salt_buf[4 * i + 1]);
}
/**
* loop
*/
u32x w0l = w[0];
if (count == 1)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0;
_totp_calculate (&otp_code0, w, pw_len, s, 8);
COMPARE_S_SIMD (otp_code0, 0, 0, 0);
}
}
else if (count == 2)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0, otp_code1;
_totp_calculate (&otp_code0, w, pw_len, s, 8);
if (MATCHES_ONE_VS(otp_code0, search[0]))
{
_totp_calculate (&otp_code1, w, pw_len, s + 16, 8);
COMPARE_S_SIMD (otp_code0, otp_code1, 0, 0);
}
}
}
else if (count == 3)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0, otp_code1, otp_code2;
_totp_calculate (&otp_code0, w, pw_len, s, 8);
if (MATCHES_ONE_VS(otp_code0, search[0]))
{
_totp_calculate (&otp_code1, w, pw_len, s + 16, 8);
if (MATCHES_ONE_VS(otp_code1, search[1]))
{
_totp_calculate (&otp_code2, w, pw_len, s + 32, 8);
COMPARE_S_SIMD (otp_code0, otp_code1, otp_code2, 0);
}
}
}
}
else if (count == 4)
{
for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0 = w0l | w0r;
w[0] = w0;
u32x otp_code0, otp_code1, otp_code2, otp_code3;
_totp_calculate (&otp_code0, w, pw_len, s, 8);
if (MATCHES_ONE_VS(otp_code0, search[0]))
{
_totp_calculate (&otp_code1, w, pw_len, s + 16, 8);
if (MATCHES_ONE_VS(otp_code1, search[1]))
{
_totp_calculate (&otp_code2, w, pw_len, s + 32, 8);
if (MATCHES_ONE_VS(otp_code2, search[2]))
{
_totp_calculate (&otp_code3, w, pw_len, s + 48, 8);
COMPARE_S_SIMD (otp_code0, otp_code1, otp_code2, otp_code3);
}
}
}
}
}
}

View File

@ -636,7 +636,7 @@ KERNEL_FQ KERNEL_FA void m18600_loop (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t
}
}
KERNEL_FQ KERNEL_FA void FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
KERNEL_FQ KERNEL_FA FIXED_THREAD_COUNT(FIXED_LOCAL_SIZE_COMP) void m18600_comp (KERN_ATTR_TMPS_ESALT (odf11_tmp_t, odf11_t))
{
const u64 gid = get_global_id (0);
const u64 lid = get_local_id (0);

View File

@ -368,7 +368,7 @@ KERNEL_FQ KERNEL_FA void m26610_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t,
AES_GCM_decrypt (key, J0, ct, 32, pt, s_te0, s_te1, s_te2, s_te3, s_te4);
const int correct = is_valid_printable_32 (pt[0])
int correct = is_valid_printable_32 (pt[0])
+ is_valid_printable_32 (pt[1])
+ is_valid_printable_32 (pt[2])
+ is_valid_printable_32 (pt[3])
@ -379,6 +379,37 @@ KERNEL_FQ KERNEL_FA void m26610_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t,
if (correct != 8) return;
u32 ct2[8];
ct2[0] = pbkdf2_sha256_aes_gcm->ct_buf[8]; // third block of ciphertext
ct2[1] = pbkdf2_sha256_aes_gcm->ct_buf[9];
ct2[2] = pbkdf2_sha256_aes_gcm->ct_buf[10];
ct2[3] = pbkdf2_sha256_aes_gcm->ct_buf[11];
ct2[4] = pbkdf2_sha256_aes_gcm->ct_buf[12]; // fourth block of ciphertext
ct2[5] = pbkdf2_sha256_aes_gcm->ct_buf[13];
ct2[6] = pbkdf2_sha256_aes_gcm->ct_buf[14];
ct2[7] = pbkdf2_sha256_aes_gcm->ct_buf[15];
// Only a single increment as the previous AES_GCM_DECRYPT already does one for us
J0[3]++;
u32 pt2[8] = { 0 };
AES_GCM_decrypt (key, J0, ct2, 32, pt2, s_te0, s_te1, s_te2, s_te3, s_te4);
correct = is_valid_printable_32 (pt2[0])
+ is_valid_printable_32 (pt2[1])
+ is_valid_printable_32 (pt2[2])
+ is_valid_printable_32 (pt2[3])
+ is_valid_printable_32 (pt2[4])
+ is_valid_printable_32 (pt2[5])
+ is_valid_printable_32 (pt2[6])
+ is_valid_printable_32 (pt2[7]);
// We need to check a third and fourth block to avoid extremely rare false-positives. See:
// https://github.com/hashcat/hashcat/issues/4121
if (correct != 8) return;
/*
const int pt_len = 28; // not using 32 byte but 28 because our UTF8 allows up to 4 byte per character and since we decrypt 32 byte
// only we can't guarantee it is not in the middle of a UTF8 byte stream at that point

157
OpenCL/m33000_a0-pure.cl Normal file
View File

@ -0,0 +1,157 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_rp.h)
#include M2S(INCLUDE_PATH/inc_rp.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_md5.cl)
#endif
typedef struct md5_double_salt
{
u32 salt1_buf[64];
int salt1_len;
u32 salt2_buf[64];
int salt2_len;
} md5_double_salt_t;
/**
 * m33000_mxx (-a 0, multi-hash): MD5 (salt1 . password . salt2).
 * One rule from rules_buf is applied to the base word per iteration,
 * then the digest is compared against the full set of loaded hashes.
 */
KERNEL_FQ KERNEL_FA void m33000_mxx (KERN_ATTR_RULES_ESALT (md5_double_salt_t))
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * base
   */

  COPY_PW (pws[gid]);

  const int salt2_len = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_len;

  // copy salt2 into private memory word-wise; unused words stay zero
  u32 s2[64] = { 0 };

  for (int i = 0, idx = 0; i < salt2_len; i += 4, idx += 1)
  {
    s2[idx] = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_buf[idx];
  }

  // hash salt1 once up front; this partial state is reused for every rule
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_buf, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    md5_ctx_t ctx = ctx0;           // resume from the salt1-only state

    md5_update (&ctx, tmp.i, tmp.pw_len);

    md5_update (&ctx, s2, salt2_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * m33000_sxx (-a 0, single-hash): MD5 (salt1 . password . salt2).
 * Same scheme as m33000_mxx, but compares against one target digest
 * preloaded into `search`.
 */
KERNEL_FQ KERNEL_FA void m33000_sxx (KERN_ATTR_RULES_ESALT (md5_double_salt_t))
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  COPY_PW (pws[gid]);

  const int salt2_len = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_len;

  // copy salt2 into private memory word-wise; unused words stay zero
  u32 s2[64] = { 0 };

  for (int i = 0, idx = 0; i < salt2_len; i += 4, idx += 1)
  {
    s2[idx] = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_buf[idx];
  }

  // hash salt1 once up front; this partial state is reused for every rule
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_buf, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    md5_ctx_t ctx = ctx0;           // resume from the salt1-only state

    md5_update (&ctx, tmp.i, tmp.pw_len);

    md5_update (&ctx, s2, salt2_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

147
OpenCL/m33000_a1-pure.cl Normal file
View File

@ -0,0 +1,147 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_md5.cl)
#endif
typedef struct md5_double_salt
{
u32 salt1_buf[64];
int salt1_len;
u32 salt2_buf[64];
int salt2_len;
} md5_double_salt_t;
/**
 * m33000_mxx (-a 1 combinator, multi-hash): MD5 (salt1 . left . right . salt2).
 * The left word (pws) is folded into the partial MD5 state once; each
 * iteration appends one right word from combs_buf plus salt2.
 */
KERNEL_FQ KERNEL_FA void m33000_mxx (KERN_ATTR_ESALT (md5_double_salt_t))
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * base
   */

  const int salt2_len = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_len;

  // copy salt2 into private memory word-wise; unused words stay zero
  u32 s2[64] = { 0 };

  for (int i = 0, idx = 0; i < salt2_len; i += 4, idx += 1)
  {
    s2[idx] = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_buf[idx];
  }

  // precompute MD5 (salt1 . left) once; reused for every right word
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_buf, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_len);

  md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    md5_ctx_t ctx = ctx0;

    md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md5_update (&ctx, s2, salt2_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * m33000_sxx (-a 1 combinator, single-hash): MD5 (salt1 . left . right . salt2).
 * Same scheme as the combinator mxx kernel, compared against one target
 * digest preloaded into `search`.
 */
KERNEL_FQ KERNEL_FA void m33000_sxx (KERN_ATTR_ESALT (md5_double_salt_t))
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  const int salt2_len = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_len;

  // copy salt2 into private memory word-wise; unused words stay zero
  u32 s2[64] = { 0 };

  for (int i = 0, idx = 0; i < salt2_len; i += 4, idx += 1)
  {
    s2[idx] = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_buf[idx];
  }

  // precompute MD5 (salt1 . left) once; reused for every right word
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_buf, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_len);

  md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    md5_ctx_t ctx = ctx0;

    md5_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md5_update (&ctx, s2, salt2_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

181
OpenCL/m33000_a3-pure.cl Normal file
View File

@ -0,0 +1,181 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_md5.cl)
#endif
typedef struct md5_double_salt
{
u32 salt1_buf[64];
int salt1_len;
u32 salt2_buf[64];
int salt2_len;
} md5_double_salt_t;
/**
 * m33000_mxx (-a 3 brute-force, vectorized, multi-hash):
 * MD5 (salt1 . password . salt2). The first password word is OR-combined
 * with per-lane candidate bits from words_buf_r each iteration.
 */
KERNEL_FQ KERNEL_FA void m33000_mxx (KERN_ATTR_VECTOR_ESALT (md5_double_salt_t))
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * base
   */

  const u32 pw_len = pws[gid].pw_len;

  // scalar base words broadcast into all vector lanes
  u32x w[64] = { 0 };

  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }

  const int salt2_len = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_len;

  u32x s2[64] = { 0 };

  for (int i = 0, idx = 0; i < salt2_len; i += 4, idx += 1)
  {
    s2[idx] = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_buf[idx];
  }

  // scalar partial state over salt1, expanded to vector lanes per iteration
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_buf, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_len);

  /**
   * loop
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0 = w0l | w0r;   // merge fixed left bits with per-lane right bits

    w[0] = w0;

    md5_ctx_vector_t ctx;

    md5_init_vector_from_scalar (&ctx, &ctx0);

    md5_update_vector (&ctx, w, pw_len);

    md5_update_vector (&ctx, s2, salt2_len);

    md5_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}
/**
 * m33000_sxx (-a 3 brute-force, vectorized, single-hash):
 * MD5 (salt1 . password . salt2), compared against one target digest
 * preloaded into `search`.
 */
KERNEL_FQ KERNEL_FA void m33000_sxx (KERN_ATTR_VECTOR_ESALT (md5_double_salt_t))
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  const u32 pw_len = pws[gid].pw_len;

  // scalar base words broadcast into all vector lanes
  u32x w[64] = { 0 };

  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }

  const int salt2_len = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_len;

  u32x s2[64] = { 0 };

  for (int i = 0, idx = 0; i < salt2_len; i += 4, idx += 1)
  {
    s2[idx] = esalt_bufs[DIGESTS_OFFSET_HOST].salt2_buf[idx];
  }

  // scalar partial state over salt1, expanded to vector lanes per iteration
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_buf, esalt_bufs[DIGESTS_OFFSET_HOST].salt1_len);

  /**
   * loop
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0 = w0l | w0r;   // merge fixed left bits with per-lane right bits

    w[0] = w0;

    md5_ctx_vector_t ctx;

    md5_init_vector_from_scalar (&ctx, &ctx0);

    md5_update_vector (&ctx, w, pw_len);

    md5_update_vector (&ctx, s2, salt2_len);

    md5_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}

277
OpenCL/m33100_a0-pure.cl Normal file
View File

@ -0,0 +1,277 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_rp.h)
#include M2S(INCLUDE_PATH/inc_rp.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_md5.cl)
#endif
#if VECT_SIZE == 1
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
#elif VECT_SIZE == 2
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
#elif VECT_SIZE == 4
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
#elif VECT_SIZE == 8
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
#elif VECT_SIZE == 16
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif
/**
 * m33100_mxx (-a 0, multi-hash): md5 (salt . md5_hex (password) . salt).
 * The inner MD5 of the rule-mutated password is hex-encoded to 32
 * lowercase ASCII chars before being wrapped between two salt copies.
 */
KERNEL_FQ KERNEL_FA void m33100_mxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
  const u64 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex ASCII chars, built
   * cooperatively by the work-group in local memory
   */

  LOCAL_VK u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
  }

  SYNC_THREADS ();

  if (gid >= GID_CNT) return;

  /**
   * base
   */

  COPY_PW (pws[gid]);

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  u32 s[64] = { 0 };

  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    // inner hash: MD5 (rule (password))
    md5_ctx_t ctx0;

    md5_init (&ctx0);

    md5_update (&ctx0, tmp.i, tmp.pw_len);

    md5_final (&ctx0);

    const u32 a = ctx0.h[0];
    const u32 b = ctx0.h[1];
    const u32 c = ctx0.h[2];
    const u32 d = ctx0.h[3];

    // outer hash: md5 (salt . hex (inner) . salt)
    md5_ctx_t ctx;

    md5_init (&ctx);

    md5_update (&ctx, s, salt_len);

    // hex-encode the 16-byte inner digest into 32 ASCII chars (8 words)
    u32 w0[4];
    u32 w1[4];
    u32 w2[4];
    u32 w3[4];

    w0[0] = uint_to_hex_lower8 ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((a >>  8) & 255) << 16;
    w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
    w0[2] = uint_to_hex_lower8 ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((b >>  8) & 255) << 16;
    w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
    w1[0] = uint_to_hex_lower8 ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((c >>  8) & 255) << 16;
    w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
    w1[2] = uint_to_hex_lower8 ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((d >>  8) & 255) << 16;
    w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((d >> 24) & 255) << 16;

    w2[0] = 0;
    w2[1] = 0;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    md5_update_64 (&ctx, w0, w1, w2, w3, 32);

    md5_update (&ctx, s, salt_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * m33100_sxx (-a 0, single-hash): md5 (salt . md5_hex (password) . salt).
 * Same scheme as m33100_mxx, but compares against one target digest
 * preloaded into `search`.
 */
KERNEL_FQ KERNEL_FA void m33100_sxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
  const u64 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex ASCII chars, built
   * cooperatively by the work-group in local memory
   */

  LOCAL_VK u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
  }

  SYNC_THREADS ();

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  COPY_PW (pws[gid]);

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  u32 s[64] = { 0 };

  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    // inner hash: MD5 (rule (password))
    md5_ctx_t ctx0;

    md5_init (&ctx0);

    md5_update (&ctx0, tmp.i, tmp.pw_len);

    md5_final (&ctx0);

    const u32 a = ctx0.h[0];
    const u32 b = ctx0.h[1];
    const u32 c = ctx0.h[2];
    const u32 d = ctx0.h[3];

    // outer hash: md5 (salt . hex (inner) . salt)
    md5_ctx_t ctx;

    md5_init (&ctx);

    md5_update (&ctx, s, salt_len);

    // hex-encode the 16-byte inner digest into 32 ASCII chars (8 words)
    u32 w0[4];
    u32 w1[4];
    u32 w2[4];
    u32 w3[4];

    w0[0] = uint_to_hex_lower8 ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((a >>  8) & 255) << 16;
    w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
    w0[2] = uint_to_hex_lower8 ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((b >>  8) & 255) << 16;
    w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
    w1[0] = uint_to_hex_lower8 ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((c >>  8) & 255) << 16;
    w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
    w1[2] = uint_to_hex_lower8 ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((d >>  8) & 255) << 16;
    w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((d >> 24) & 255) << 16;

    w2[0] = 0;
    w2[1] = 0;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    md5_update_64 (&ctx, w0, w1, w2, w3, 32);

    md5_update (&ctx, s, salt_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

271
OpenCL/m33100_a1-pure.cl Normal file
View File

@ -0,0 +1,271 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_md5.cl)
#endif
#if VECT_SIZE == 1
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
#elif VECT_SIZE == 2
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
#elif VECT_SIZE == 4
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
#elif VECT_SIZE == 8
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
#elif VECT_SIZE == 16
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif
/**
 * m33100_mxx (-a 1 combinator, multi-hash):
 * md5 (salt . md5_hex (left . right) . salt). The inner MD5 state over
 * the left word is precomputed once and reused for every right word.
 */
KERNEL_FQ KERNEL_FA void m33100_mxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
  const u64 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex ASCII chars, built
   * cooperatively by the work-group in local memory
   */

  LOCAL_VK u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
  }

  SYNC_THREADS ();

  if (gid >= GID_CNT) return;

  /**
   * base
   */

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  u32 s[64] = { 0 };

  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }

  // partial inner MD5 over the left word only
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    // inner hash: MD5 (left . right)
    md5_ctx_t ctx1 = ctx0;

    md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md5_final (&ctx1);

    const u32 a = ctx1.h[0];
    const u32 b = ctx1.h[1];
    const u32 c = ctx1.h[2];
    const u32 d = ctx1.h[3];

    // outer hash: md5 (salt . hex (inner) . salt)
    md5_ctx_t ctx;

    md5_init (&ctx);

    md5_update (&ctx, s, salt_len);

    // hex-encode the 16-byte inner digest into 32 ASCII chars (8 words)
    u32 w0[4];
    u32 w1[4];
    u32 w2[4];
    u32 w3[4];

    w0[0] = uint_to_hex_lower8 ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((a >>  8) & 255) << 16;
    w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
    w0[2] = uint_to_hex_lower8 ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((b >>  8) & 255) << 16;
    w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
    w1[0] = uint_to_hex_lower8 ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((c >>  8) & 255) << 16;
    w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
    w1[2] = uint_to_hex_lower8 ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((d >>  8) & 255) << 16;
    w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((d >> 24) & 255) << 16;

    w2[0] = 0;
    w2[1] = 0;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    md5_update_64 (&ctx, w0, w1, w2, w3, 32);

    md5_update (&ctx, s, salt_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * m33100_sxx (-a 1 combinator, single-hash):
 * md5 (salt . md5_hex (left . right) . salt), compared against one
 * target digest preloaded into `search`.
 */
KERNEL_FQ KERNEL_FA void m33100_sxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
  const u64 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex ASCII chars, built
   * cooperatively by the work-group in local memory
   */

  LOCAL_VK u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
  }

  SYNC_THREADS ();

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  u32 s[64] = { 0 };

  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }

  // partial inner MD5 over the left word only
  md5_ctx_t ctx0;

  md5_init (&ctx0);

  md5_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    // inner hash: MD5 (left . right)
    md5_ctx_t ctx1 = ctx0;

    md5_update_global (&ctx1, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);

    md5_final (&ctx1);

    const u32 a = ctx1.h[0];
    const u32 b = ctx1.h[1];
    const u32 c = ctx1.h[2];
    const u32 d = ctx1.h[3];

    // outer hash: md5 (salt . hex (inner) . salt)
    md5_ctx_t ctx;

    md5_init (&ctx);

    md5_update (&ctx, s, salt_len);

    // hex-encode the 16-byte inner digest into 32 ASCII chars (8 words)
    u32 w0[4];
    u32 w1[4];
    u32 w2[4];
    u32 w3[4];

    w0[0] = uint_to_hex_lower8 ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((a >>  8) & 255) << 16;
    w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
    w0[2] = uint_to_hex_lower8 ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((b >>  8) & 255) << 16;
    w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
    w1[0] = uint_to_hex_lower8 ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((c >>  8) & 255) << 16;
    w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
    w1[2] = uint_to_hex_lower8 ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((d >>  8) & 255) << 16;
    w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((d >> 24) & 255) << 16;

    w2[0] = 0;
    w2[1] = 0;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    md5_update_64 (&ctx, w0, w1, w2, w3, 32);

    md5_update (&ctx, s, salt_len);

    md5_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

297
OpenCL/m33100_a3-pure.cl Normal file
View File

@ -0,0 +1,297 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_md5.cl)
#endif
#if VECT_SIZE == 1
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i)])
#elif VECT_SIZE == 2
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1])
#elif VECT_SIZE == 4
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3])
#elif VECT_SIZE == 8
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7])
#elif VECT_SIZE == 16
#define uint_to_hex_lower8(i) make_u32x (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif
/**
 * m33100_mxx (-a 3 brute-force, vectorized, multi-hash):
 * md5 (salt . md5_hex (password) . salt). Candidate bits from
 * words_buf_r are OR-merged into the first password word per lane.
 */
KERNEL_FQ KERNEL_FA void m33100_mxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
  const u64 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex ASCII chars, built
   * cooperatively by the work-group in local memory
   */

  LOCAL_VK u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
  }

  SYNC_THREADS ();

  if (gid >= GID_CNT) return;

  /**
   * base
   */

  const u32 pw_len = pws[gid].pw_len;

  // scalar base words broadcast into all vector lanes
  u32x w[64] = { 0 };

  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  u32x s[64] = { 0 };

  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }

  /**
   * loop
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0lr = w0l | w0r;   // merge fixed left bits with per-lane right bits

    w[0] = w0lr;

    // inner hash: MD5 (password)
    md5_ctx_vector_t ctx0;

    md5_init_vector (&ctx0);

    md5_update_vector (&ctx0, w, pw_len);

    md5_final_vector (&ctx0);

    const u32x a = ctx0.h[0];
    const u32x b = ctx0.h[1];
    const u32x c = ctx0.h[2];
    const u32x d = ctx0.h[3];

    // outer hash: md5 (salt . hex (inner) . salt)
    md5_ctx_vector_t ctx;

    md5_init_vector (&ctx);

    md5_update_vector (&ctx, s, salt_len);

    // hex-encode the 16-byte inner digest into 32 ASCII chars (8 words)
    u32x w0[4];
    u32x w1[4];
    u32x w2[4];
    u32x w3[4];

    w0[0] = uint_to_hex_lower8 ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((a >>  8) & 255) << 16;
    w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
    w0[2] = uint_to_hex_lower8 ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((b >>  8) & 255) << 16;
    w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
    w1[0] = uint_to_hex_lower8 ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((c >>  8) & 255) << 16;
    w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
    w1[2] = uint_to_hex_lower8 ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((d >>  8) & 255) << 16;
    w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((d >> 24) & 255) << 16;

    w2[0] = 0;
    w2[1] = 0;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    md5_update_vector_64 (&ctx, w0, w1, w2, w3, 32);

    md5_update_vector (&ctx, s, salt_len);

    md5_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}
/**
 * m33100_sxx (-a 3 brute-force, vectorized, single-hash):
 * md5 (salt . md5_hex (password) . salt), compared against one target
 * digest preloaded into `search`.
 */
KERNEL_FQ KERNEL_FA void m33100_sxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);
  const u64 lsz = get_local_size (0);

  /**
   * bin2asc table: byte -> two lowercase hex ASCII chars, built
   * cooperatively by the work-group in local memory
   */

  LOCAL_VK u32 l_bin2asc[256];

  for (u32 i = lid; i < 256; i += lsz)
  {
    const u32 i0 = (i >> 0) & 15;
    const u32 i1 = (i >> 4) & 15;

    l_bin2asc[i] = ((i0 < 10) ? '0' + i0 : 'a' - 10 + i0) << 8
                 | ((i1 < 10) ? '0' + i1 : 'a' - 10 + i1) << 0;
  }

  SYNC_THREADS ();

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  const u32 pw_len = pws[gid].pw_len;

  // scalar base words broadcast into all vector lanes
  u32x w[64] = { 0 };

  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  u32x s[64] = { 0 };

  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }

  /**
   * loop
   */

  u32x w0l = w[0];

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

    const u32x w0lr = w0l | w0r;   // merge fixed left bits with per-lane right bits

    w[0] = w0lr;

    // inner hash: MD5 (password)
    md5_ctx_vector_t ctx0;

    md5_init_vector (&ctx0);

    md5_update_vector (&ctx0, w, pw_len);

    md5_final_vector (&ctx0);

    const u32x a = ctx0.h[0];
    const u32x b = ctx0.h[1];
    const u32x c = ctx0.h[2];
    const u32x d = ctx0.h[3];

    // outer hash: md5 (salt . hex (inner) . salt)
    md5_ctx_vector_t ctx;

    md5_init_vector (&ctx);

    md5_update_vector (&ctx, s, salt_len);

    // hex-encode the 16-byte inner digest into 32 ASCII chars (8 words)
    u32x w0[4];
    u32x w1[4];
    u32x w2[4];
    u32x w3[4];

    w0[0] = uint_to_hex_lower8 ((a >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((a >>  8) & 255) << 16;
    w0[1] = uint_to_hex_lower8 ((a >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((a >> 24) & 255) << 16;
    w0[2] = uint_to_hex_lower8 ((b >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((b >>  8) & 255) << 16;
    w0[3] = uint_to_hex_lower8 ((b >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((b >> 24) & 255) << 16;
    w1[0] = uint_to_hex_lower8 ((c >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((c >>  8) & 255) << 16;
    w1[1] = uint_to_hex_lower8 ((c >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((c >> 24) & 255) << 16;
    w1[2] = uint_to_hex_lower8 ((d >>  0) & 255) <<  0
          | uint_to_hex_lower8 ((d >>  8) & 255) << 16;
    w1[3] = uint_to_hex_lower8 ((d >> 16) & 255) <<  0
          | uint_to_hex_lower8 ((d >> 24) & 255) << 16;

    w2[0] = 0;
    w2[1] = 0;
    w2[2] = 0;
    w2[3] = 0;
    w3[0] = 0;
    w3[1] = 0;
    w3[2] = 0;
    w3[3] = 0;

    md5_update_vector_64 (&ctx, w0, w1, w2, w3, 32);

    md5_update_vector (&ctx, s, salt_len);

    md5_final_vector (&ctx);

    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];

    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}

View File

@ -0,0 +1,225 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_rp_optimized.h)
#include M2S(INCLUDE_PATH/inc_rp_optimized.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * Feed one 64-byte message block, supplied as 16 contiguous u32x words,
 * into the RIPEMD-320 vector transform (which expects four 4-word chunks).
 */
DECLSPEC void ripemd320_transform_transport_vector (PRIVATE_AS const u32x *w, PRIVATE_AS u32x *dgst)
{
  ripemd320_transform_vector (&w[ 0], &w[ 4], &w[ 8], &w[12], dgst);
}
/**
 * m33600_m04 (-a 0, optimized, multi-hash): raw RIPEMD-320 of the
 * rule-mutated password. Optimized single-block path: the password
 * length is masked to 0..63 so the whole message fits one transform.
 */
KERNEL_FQ KERNEL_FA void m33600_m04 (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);

  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  u32 pw_buf0[4];
  u32 pw_buf1[4];

  pw_buf0[0] = pws[gid].i[0];
  pw_buf0[1] = pws[gid].i[1];
  pw_buf0[2] = pws[gid].i[2];
  pw_buf0[3] = pws[gid].i[3];
  pw_buf1[0] = pws[gid].i[4];
  pw_buf1[1] = pws[gid].i[5];
  pw_buf1[2] = pws[gid].i[6];
  pw_buf1[3] = pws[gid].i[7];

  const u32 pw_len = pws[gid].pw_len & 63;

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    u32x w0[4] = { 0 };
    u32x w1[4] = { 0 };
    u32x w2[4] = { 0 };   // NOTE(review): w2/w3 are zero-filled and never written below
    u32x w3[4] = { 0 };

    const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);

    // place the 0x80 terminator byte directly after the candidate
    append_0x80_2x4_VV (w0, w1, out_len);

    u32x w[16];

    w[ 0] = w0[0];
    w[ 1] = w0[1];
    w[ 2] = w0[2];
    w[ 3] = w0[3];
    w[ 4] = w1[0];
    w[ 5] = w1[1];
    w[ 6] = w1[2];
    w[ 7] = w1[3];
    w[ 8] = 0;
    w[ 9] = 0;
    w[10] = 0;
    w[11] = 0;
    w[12] = 0;
    w[13] = 0;
    w[14] = out_len * 8;   // message length in bits (low word)
    w[15] = 0;

    /**
     * RipeMD320
     */

    u32x dgst[10];

    dgst[0] = RIPEMD320M_A;
    dgst[1] = RIPEMD320M_B;
    dgst[2] = RIPEMD320M_C;
    dgst[3] = RIPEMD320M_D;
    dgst[4] = RIPEMD320M_E;
    dgst[5] = RIPEMD320M_F;
    dgst[6] = RIPEMD320M_G;
    dgst[7] = RIPEMD320M_H;
    dgst[8] = RIPEMD320M_I;
    dgst[9] = RIPEMD320M_L;

    ripemd320_transform_transport_vector (w, dgst);

    // only digest words 0..3 are compared on-device; assumes the host
    // maps DGST_R0..R3 to those words for this mode -- TODO confirm
    COMPARE_M_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
  }
}
// Intentionally empty stub: the optimized-kernel ABI expects _m08/_m16
// entry points, but this mode only implements the single-block _m04 path.
KERNEL_FQ KERNEL_FA void m33600_m08 (KERN_ATTR_RULES ())
{
}
// Intentionally empty stub: see the note on m33600_m08.
KERNEL_FQ KERNEL_FA void m33600_m16 (KERN_ATTR_RULES ())
{
}
/**
 * m33600_s04 (-a 0, optimized, single-hash): raw RIPEMD-320 of the
 * rule-mutated password, compared against one target digest preloaded
 * into `search`. Single-block path (password length masked to 0..63).
 */
KERNEL_FQ KERNEL_FA void m33600_s04 (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);

  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  u32 pw_buf0[4];
  u32 pw_buf1[4];

  pw_buf0[0] = pws[gid].i[0];
  pw_buf0[1] = pws[gid].i[1];
  pw_buf0[2] = pws[gid].i[2];
  pw_buf0[3] = pws[gid].i[3];
  pw_buf1[0] = pws[gid].i[4];
  pw_buf1[1] = pws[gid].i[5];
  pw_buf1[2] = pws[gid].i[6];
  pw_buf1[3] = pws[gid].i[7];

  const u32 pw_len = pws[gid].pw_len & 63;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    u32x w0[4] = { 0 };
    u32x w1[4] = { 0 };
    u32x w2[4] = { 0 };   // NOTE(review): w2/w3 are zero-filled and never written below
    u32x w3[4] = { 0 };

    const u32x out_len = apply_rules_vect_optimized (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);

    // place the 0x80 terminator byte directly after the candidate
    append_0x80_2x4_VV (w0, w1, out_len);

    u32x w[16];

    w[ 0] = w0[0];
    w[ 1] = w0[1];
    w[ 2] = w0[2];
    w[ 3] = w0[3];
    w[ 4] = w1[0];
    w[ 5] = w1[1];
    w[ 6] = w1[2];
    w[ 7] = w1[3];
    w[ 8] = 0;
    w[ 9] = 0;
    w[10] = 0;
    w[11] = 0;
    w[12] = 0;
    w[13] = 0;
    w[14] = out_len * 8;   // message length in bits (low word)
    w[15] = 0;

    /**
     * RipeMD320
     */

    u32x dgst[10];

    dgst[0] = RIPEMD320M_A;
    dgst[1] = RIPEMD320M_B;
    dgst[2] = RIPEMD320M_C;
    dgst[3] = RIPEMD320M_D;
    dgst[4] = RIPEMD320M_E;
    dgst[5] = RIPEMD320M_F;
    dgst[6] = RIPEMD320M_G;
    dgst[7] = RIPEMD320M_H;
    dgst[8] = RIPEMD320M_I;
    dgst[9] = RIPEMD320M_L;

    ripemd320_transform_transport_vector (w, dgst);

    // only digest words 0..3 are compared on-device; assumes the host
    // maps DGST_R0..R3 to those words for this mode -- TODO confirm
    COMPARE_S_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
  }
}
// Intentionally empty stub: the optimized-kernel ABI expects _s08/_s16
// entry points, but this mode only implements the single-block _s04 path.
KERNEL_FQ KERNEL_FA void m33600_s08 (KERN_ATTR_RULES ())
{
}
// Intentionally empty stub: see the note on m33600_s08.
KERNEL_FQ KERNEL_FA void m33600_s16 (KERN_ATTR_RULES ())
{
}

118
OpenCL/m33600_a0-pure.cl Normal file
View File

@ -0,0 +1,118 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_rp.h)
#include M2S(INCLUDE_PATH/inc_rp.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * m33600_mxx (-a 0, pure, multi-hash): raw RIPEMD-320 of the
 * rule-mutated password, compared against all loaded digests.
 */
KERNEL_FQ KERNEL_FA void m33600_mxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);

  if (gid >= GID_CNT) return;

  /**
   * base
   */

  COPY_PW (pws[gid]);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    // mutate the base word with the current rule
    pw_t cand = PASTE_PW;

    cand.pw_len = apply_rules (rules_buf[il_pos].cmds, cand.i, cand.pw_len);

    ripemd320_ctx_t ctx;

    ripemd320_init   (&ctx);
    ripemd320_update (&ctx, cand.i, cand.pw_len);
    ripemd320_final  (&ctx);

    const u32 d0 = ctx.h[DGST_R0];
    const u32 d1 = ctx.h[DGST_R1];
    const u32 d2 = ctx.h[DGST_R2];
    const u32 d3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (d0, d1, d2, d3);
  }
}
/**
 * m33600_sxx (-a 0, pure, single-hash): raw RIPEMD-320 of the
 * rule-mutated password, compared against the single target digest
 * preloaded into `search`.
 */
KERNEL_FQ KERNEL_FA void m33600_sxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  COPY_PW (pws[gid]);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    // mutate the base word with the current rule
    pw_t cand = PASTE_PW;

    cand.pw_len = apply_rules (rules_buf[il_pos].cmds, cand.i, cand.pw_len);

    ripemd320_ctx_t ctx;

    ripemd320_init   (&ctx);
    ripemd320_update (&ctx, cand.i, cand.pw_len);
    ripemd320_final  (&ctx);

    const u32 d0 = ctx.h[DGST_R0];
    const u32 d1 = ctx.h[DGST_R1];
    const u32 d2 = ctx.h[DGST_R2];
    const u32 d3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (d0, d1, d2, d3);
  }
}

View File

@ -0,0 +1,339 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * Adapter: feeds a flat 16-word message block to the RIPEMD-320 vector
 * transform, which expects the block as four pointers of 4 words each.
 */
DECLSPEC void ripemd320_transform_transport_vector (PRIVATE_AS const u32x *w, PRIVATE_AS u32x *dgst)
{
  ripemd320_transform_vector (w + 0, w + 4, w + 8, w + 12, dgst);
}
/**
 * RIPEMD-320 (-m 33600), combinator attack (-a 1), optimized kernel,
 * multi-hash path, base passwords of up to 4 u32 words. Concatenates the
 * left (base) and right (combinator) words into one 64-byte message block,
 * runs a single RIPEMD-320 transform and checks via COMPARE_M_SIMD.
 */
KERNEL_FQ KERNEL_FA void m33600_m04 (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  /**
   * base
   */
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  u32 pw_buf0[4];
  u32 pw_buf1[4];
  pw_buf0[0] = pws[gid].i[0];
  pw_buf0[1] = pws[gid].i[1];
  pw_buf0[2] = pws[gid].i[2];
  pw_buf0[3] = pws[gid].i[3];
  pw_buf1[0] = pws[gid].i[4];
  pw_buf1[1] = pws[gid].i[5];
  pw_buf1[2] = pws[gid].i[6];
  pw_buf1[3] = pws[gid].i[7];
  // optimized kernels cap lengths at 63 bytes (single message block)
  const u32 pw_l_len = pws[gid].pw_len & 63;
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
    const u32x pw_len = (pw_l_len + pw_r_len) & 63;
    /**
     * concat password candidate
     */
    // wordl2/wordl3 (and wordr2/wordr3) start empty but can receive data
    // when switch_buffer_by_offset_le_VV shifts words past 32 bytes
    u32x wordl0[4] = { 0 };
    u32x wordl1[4] = { 0 };
    u32x wordl2[4] = { 0 };
    u32x wordl3[4] = { 0 };
    wordl0[0] = pw_buf0[0];
    wordl0[1] = pw_buf0[1];
    wordl0[2] = pw_buf0[2];
    wordl0[3] = pw_buf0[3];
    wordl1[0] = pw_buf1[0];
    wordl1[1] = pw_buf1[1];
    wordl1[2] = pw_buf1[2];
    wordl1[3] = pw_buf1[3];
    u32x wordr0[4] = { 0 };
    u32x wordr1[4] = { 0 };
    u32x wordr2[4] = { 0 };
    u32x wordr3[4] = { 0 };
    wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
    wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
    wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
    wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
    wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
    wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
    wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
    wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
    // shift whichever side is appended so the two halves can be OR-merged
    if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT)
    {
      switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
    }
    else
    {
      switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
    }
    u32x w0[4];
    u32x w1[4];
    u32x w2[4];
    u32x w3[4];
    w0[0] = wordl0[0] | wordr0[0];
    w0[1] = wordl0[1] | wordr0[1];
    w0[2] = wordl0[2] | wordr0[2];
    w0[3] = wordl0[3] | wordr0[3];
    w1[0] = wordl1[0] | wordr1[0];
    w1[1] = wordl1[1] | wordr1[1];
    w1[2] = wordl1[2] | wordr1[2];
    w1[3] = wordl1[3] | wordr1[3];
    w2[0] = wordl2[0] | wordr2[0];
    w2[1] = wordl2[1] | wordr2[1];
    w2[2] = wordl2[2] | wordr2[2];
    w2[3] = wordl2[3] | wordr2[3];
    w3[0] = wordl3[0] | wordr3[0];
    w3[1] = wordl3[1] | wordr3[1];
    w3[2] = wordl3[2] | wordr3[2];
    w3[3] = wordl3[3] | wordr3[3];
    /**
     * RipeMD320
     */
    u32x w[16];
    w[ 0] = w0[0];
    w[ 1] = w0[1];
    w[ 2] = w0[2];
    w[ 3] = w0[3];
    w[ 4] = w1[0];
    w[ 5] = w1[1];
    w[ 6] = w1[2];
    w[ 7] = w1[3];
    w[ 8] = w2[0];
    w[ 9] = w2[1];
    w[10] = w2[2];
    w[11] = w2[3];
    w[12] = w3[0];
    w[13] = w3[1];
    w[14] = pw_len * 8; // message length in bits (MD4-style length word)
    w[15] = 0;
    u32x dgst[10]; // 320-bit state = 10 words
    dgst[0] = RIPEMD320M_A;
    dgst[1] = RIPEMD320M_B;
    dgst[2] = RIPEMD320M_C;
    dgst[3] = RIPEMD320M_D;
    dgst[4] = RIPEMD320M_E;
    dgst[5] = RIPEMD320M_F;
    dgst[6] = RIPEMD320M_G;
    dgst[7] = RIPEMD320M_H;
    dgst[8] = RIPEMD320M_I;
    dgst[9] = RIPEMD320M_L;
    ripemd320_transform_transport_vector (w, dgst);
    COMPARE_M_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
  }
}
/**
 * Intentionally empty stub for the 5-8 word length class (multi-hash,
 * combinator attack); symbol required by the loader, class unused here.
 */
KERNEL_FQ KERNEL_FA void m33600_m08 (KERN_ATTR_BASIC ())
{
}
/**
 * Intentionally empty stub for the 9-16 word length class (multi-hash,
 * combinator attack); symbol required by the loader, class unused here.
 */
KERNEL_FQ KERNEL_FA void m33600_m16 (KERN_ATTR_BASIC ())
{
}
/**
 * RIPEMD-320 (-m 33600), combinator attack (-a 1), optimized kernel,
 * single-hash path. Identical candidate construction to m33600_m04; the
 * single target digest is loaded into `search` and matched with
 * COMPARE_S_SIMD.
 */
KERNEL_FQ KERNEL_FA void m33600_s04 (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  /**
   * base
   */
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  u32 pw_buf0[4];
  u32 pw_buf1[4];
  pw_buf0[0] = pws[gid].i[0];
  pw_buf0[1] = pws[gid].i[1];
  pw_buf0[2] = pws[gid].i[2];
  pw_buf0[3] = pws[gid].i[3];
  pw_buf1[0] = pws[gid].i[4];
  pw_buf1[1] = pws[gid].i[5];
  pw_buf1[2] = pws[gid].i[6];
  pw_buf1[3] = pws[gid].i[7];
  const u32 pw_l_len = pws[gid].pw_len & 63; // single-block limit
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos) & 63;
    const u32x pw_len = (pw_l_len + pw_r_len) & 63;
    /**
     * concat password candidate
     */
    u32x wordl0[4] = { 0 };
    u32x wordl1[4] = { 0 };
    u32x wordl2[4] = { 0 };
    u32x wordl3[4] = { 0 };
    wordl0[0] = pw_buf0[0];
    wordl0[1] = pw_buf0[1];
    wordl0[2] = pw_buf0[2];
    wordl0[3] = pw_buf0[3];
    wordl1[0] = pw_buf1[0];
    wordl1[1] = pw_buf1[1];
    wordl1[2] = pw_buf1[2];
    wordl1[3] = pw_buf1[3];
    u32x wordr0[4] = { 0 };
    u32x wordr1[4] = { 0 };
    u32x wordr2[4] = { 0 };
    u32x wordr3[4] = { 0 };
    wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
    wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
    wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
    wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
    wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
    wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
    wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
    wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
    // shift the appended side so both halves can be OR-merged
    if (COMBS_MODE == COMBINATOR_MODE_BASE_LEFT)
    {
      switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len);
    }
    else
    {
      switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len);
    }
    u32x w0[4];
    u32x w1[4];
    u32x w2[4];
    u32x w3[4];
    w0[0] = wordl0[0] | wordr0[0];
    w0[1] = wordl0[1] | wordr0[1];
    w0[2] = wordl0[2] | wordr0[2];
    w0[3] = wordl0[3] | wordr0[3];
    w1[0] = wordl1[0] | wordr1[0];
    w1[1] = wordl1[1] | wordr1[1];
    w1[2] = wordl1[2] | wordr1[2];
    w1[3] = wordl1[3] | wordr1[3];
    w2[0] = wordl2[0] | wordr2[0];
    w2[1] = wordl2[1] | wordr2[1];
    w2[2] = wordl2[2] | wordr2[2];
    w2[3] = wordl2[3] | wordr2[3];
    w3[0] = wordl3[0] | wordr3[0];
    w3[1] = wordl3[1] | wordr3[1];
    w3[2] = wordl3[2] | wordr3[2];
    w3[3] = wordl3[3] | wordr3[3];
    /**
     * RipeMD320
     */
    u32x w[16];
    w[ 0] = w0[0];
    w[ 1] = w0[1];
    w[ 2] = w0[2];
    w[ 3] = w0[3];
    w[ 4] = w1[0];
    w[ 5] = w1[1];
    w[ 6] = w1[2];
    w[ 7] = w1[3];
    w[ 8] = w2[0];
    w[ 9] = w2[1];
    w[10] = w2[2];
    w[11] = w2[3];
    w[12] = w3[0];
    w[13] = w3[1];
    w[14] = pw_len * 8; // message length in bits
    w[15] = 0;
    u32x dgst[10];
    dgst[0] = RIPEMD320M_A;
    dgst[1] = RIPEMD320M_B;
    dgst[2] = RIPEMD320M_C;
    dgst[3] = RIPEMD320M_D;
    dgst[4] = RIPEMD320M_E;
    dgst[5] = RIPEMD320M_F;
    dgst[6] = RIPEMD320M_G;
    dgst[7] = RIPEMD320M_H;
    dgst[8] = RIPEMD320M_I;
    dgst[9] = RIPEMD320M_L;
    ripemd320_transform_transport_vector (w, dgst);
    COMPARE_S_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
  }
}
/**
 * Intentionally empty stub for the 5-8 word length class (single-hash,
 * combinator attack); symbol required by the loader, class unused here.
 */
KERNEL_FQ KERNEL_FA void m33600_s08 (KERN_ATTR_BASIC ())
{
}
/**
 * Intentionally empty stub for the 9-16 word length class (single-hash,
 * combinator attack); symbol required by the loader, class unused here.
 */
KERNEL_FQ KERNEL_FA void m33600_s16 (KERN_ATTR_BASIC ())
{
}

112
OpenCL/m33600_a1-pure.cl Normal file
View File

@ -0,0 +1,112 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * RIPEMD-320 (-m 33600), combinator attack (-a 1), pure kernel,
 * multi-hash path. The base password is absorbed into ctx0 once; each
 * loop iteration copies ctx0, appends one combinator word, finalizes and
 * compares via COMPARE_M_SCALAR.
 */
KERNEL_FQ KERNEL_FA void m33600_mxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * base
   */
  ripemd320_ctx_t ctx0;
  ripemd320_init (&ctx0);
  // absorb the left/base word once; loop only hashes the appended part
  ripemd320_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    ripemd320_ctx_t ctx = ctx0; // restart from the precomputed midstate
    ripemd320_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
    ripemd320_final (&ctx);
    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];
    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * RIPEMD-320 (-m 33600), combinator attack (-a 1), pure kernel,
 * single-hash path. Same midstate-reuse scheme as m33600_mxx, matched
 * against the single target digest via COMPARE_S_SCALAR.
 */
KERNEL_FQ KERNEL_FA void m33600_sxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * base
   */
  ripemd320_ctx_t ctx0;
  ripemd320_init (&ctx0);
  ripemd320_update_global (&ctx0, pws[gid].i, pws[gid].pw_len);
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    ripemd320_ctx_t ctx = ctx0;
    ripemd320_update_global (&ctx, combs_buf[il_pos].i, combs_buf[il_pos].pw_len);
    ripemd320_final (&ctx);
    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];
    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

View File

@ -0,0 +1,447 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * Adapter: feeds a flat 16-word message block to the RIPEMD-320 vector
 * transform, which expects the block as four pointers of 4 words each.
 */
DECLSPEC void ripemd320_transform_transport_vector (PRIVATE_AS const u32x *w, PRIVATE_AS u32x *dgst)
{
  ripemd320_transform_vector (w + 0, w + 4, w + 8, w + 12, dgst);
}
/**
 * Worker for the brute-force (-a 3) optimized multi-hash kernels.
 * The first word of the candidate is varied per iteration from bfs_buf;
 * the rest of the (single) message block is fixed. Each candidate runs one
 * RIPEMD-320 transform and is checked via COMPARE_M_SIMD.
 */
DECLSPEC void m33600m (PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const u32 pw_len, KERN_ATTR_FUNC_BASIC ())
{
  /**
   * modifiers are taken from args
   */
  /**
   * loop
   */
  u32 w0l = w0[0]; // fixed (left) part of the first word
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = ix_create_bft (bfs_buf, il_pos); // varying (right) part
    const u32x w0lr = w0l | w0r;
    u32x w[16];
    w[ 0] = w0lr;
    w[ 1] = w0[1];
    w[ 2] = w0[2];
    w[ 3] = w0[3];
    w[ 4] = w1[0];
    w[ 5] = w1[1];
    w[ 6] = w1[2];
    w[ 7] = w1[3];
    w[ 8] = w2[0];
    w[ 9] = w2[1];
    w[10] = w2[2];
    w[11] = w2[3];
    w[12] = w3[0];
    w[13] = w3[1];
    w[14] = pw_len * 8; // message length in bits
    w[15] = 0;
    /**
     * RipeMD320
     */
    u32x dgst[10]; // 320-bit state
    dgst[0] = RIPEMD320M_A;
    dgst[1] = RIPEMD320M_B;
    dgst[2] = RIPEMD320M_C;
    dgst[3] = RIPEMD320M_D;
    dgst[4] = RIPEMD320M_E;
    dgst[5] = RIPEMD320M_F;
    dgst[6] = RIPEMD320M_G;
    dgst[7] = RIPEMD320M_H;
    dgst[8] = RIPEMD320M_I;
    dgst[9] = RIPEMD320M_L;
    ripemd320_transform_transport_vector (w, dgst);
    COMPARE_M_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
  }
}
/**
 * Worker for the brute-force (-a 3) optimized single-hash kernels.
 * Same candidate scheme as m33600m; the single target digest is loaded
 * into `search` and matched via COMPARE_S_SIMD.
 */
DECLSPEC void m33600s (PRIVATE_AS u32 *w0, PRIVATE_AS u32 *w1, PRIVATE_AS u32 *w2, PRIVATE_AS u32 *w3, const u32 pw_len, KERN_ATTR_FUNC_BASIC ())
{
  /**
   * modifiers are taken from args
   */
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * loop
   */
  u32 w0l = w0[0];
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = ix_create_bft (bfs_buf, il_pos);
    const u32x w0lr = w0l | w0r;
    u32x w[16];
    w[ 0] = w0lr;
    w[ 1] = w0[1];
    w[ 2] = w0[2];
    w[ 3] = w0[3];
    w[ 4] = w1[0];
    w[ 5] = w1[1];
    w[ 6] = w1[2];
    w[ 7] = w1[3];
    w[ 8] = w2[0];
    w[ 9] = w2[1];
    w[10] = w2[2];
    w[11] = w2[3];
    w[12] = w3[0];
    w[13] = w3[1];
    w[14] = pw_len * 8; // message length in bits
    w[15] = 0;
    /**
     * RipeMD320
     */
    u32x dgst[10];
    dgst[0] = RIPEMD320M_A;
    dgst[1] = RIPEMD320M_B;
    dgst[2] = RIPEMD320M_C;
    dgst[3] = RIPEMD320M_D;
    dgst[4] = RIPEMD320M_E;
    dgst[5] = RIPEMD320M_F;
    dgst[6] = RIPEMD320M_G;
    dgst[7] = RIPEMD320M_H;
    dgst[8] = RIPEMD320M_I;
    dgst[9] = RIPEMD320M_L;
    ripemd320_transform_transport_vector (w, dgst);
    COMPARE_S_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]);
  }
}
/**
 * Entry point: brute force (-a 3), multi-hash, passwords up to 4 u32
 * words (w1..w3 zeroed). Loads the fixed words and delegates to m33600m.
 */
KERNEL_FQ KERNEL_FA void m33600_m04 (KERN_ATTR_BASIC ())
{
  /**
   * base
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  const u64 lsz = get_local_size (0);
  if (gid >= GID_CNT) return;
  u32 w0[4];
  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];
  u32 w1[4];
  w1[0] = 0;
  w1[1] = 0;
  w1[2] = 0;
  w1[3] = 0;
  u32 w2[4];
  w2[0] = 0;
  w2[1] = 0;
  w2[2] = 0;
  w2[3] = 0;
  u32 w3[4];
  w3[0] = 0;
  w3[1] = 0;
  w3[2] = 0;
  w3[3] = 0;
  const u32 pw_len = pws[gid].pw_len & 63; // single-block limit
  /**
   * main
   */
  m33600m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
}
/**
 * Entry point: brute force (-a 3), multi-hash, passwords of 5-8 u32
 * words (w2/w3 zeroed). Loads the fixed words and delegates to m33600m.
 */
KERNEL_FQ KERNEL_FA void m33600_m08 (KERN_ATTR_BASIC ())
{
  /**
   * base
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  const u64 lsz = get_local_size (0);
  if (gid >= GID_CNT) return;
  u32 w0[4];
  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];
  u32 w1[4];
  w1[0] = pws[gid].i[ 4];
  w1[1] = pws[gid].i[ 5];
  w1[2] = pws[gid].i[ 6];
  w1[3] = pws[gid].i[ 7];
  u32 w2[4];
  w2[0] = 0;
  w2[1] = 0;
  w2[2] = 0;
  w2[3] = 0;
  u32 w3[4];
  w3[0] = 0;
  w3[1] = 0;
  w3[2] = 0;
  w3[3] = 0;
  const u32 pw_len = pws[gid].pw_len & 63;
  /**
   * main
   */
  m33600m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
}
/**
 * Entry point: brute force (-a 3), multi-hash, passwords of 9-14 u32
 * words (w3[2]/w3[3] reserved for length/padding words). Delegates to
 * m33600m.
 */
KERNEL_FQ KERNEL_FA void m33600_m16 (KERN_ATTR_BASIC ())
{
  /**
   * base
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  const u64 lsz = get_local_size (0);
  if (gid >= GID_CNT) return;
  u32 w0[4];
  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];
  u32 w1[4];
  w1[0] = pws[gid].i[ 4];
  w1[1] = pws[gid].i[ 5];
  w1[2] = pws[gid].i[ 6];
  w1[3] = pws[gid].i[ 7];
  u32 w2[4];
  w2[0] = pws[gid].i[ 8];
  w2[1] = pws[gid].i[ 9];
  w2[2] = pws[gid].i[10];
  w2[3] = pws[gid].i[11];
  u32 w3[4];
  w3[0] = pws[gid].i[12];
  w3[1] = pws[gid].i[13];
  w3[2] = 0;
  w3[3] = 0;
  const u32 pw_len = pws[gid].pw_len & 63;
  /**
   * main
   */
  m33600m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
}
/**
 * Entry point: brute force (-a 3), single-hash, passwords up to 4 u32
 * words. Loads the fixed words and delegates to m33600s.
 */
KERNEL_FQ KERNEL_FA void m33600_s04 (KERN_ATTR_BASIC ())
{
  /**
   * base
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  const u64 lsz = get_local_size (0);
  if (gid >= GID_CNT) return;
  u32 w0[4];
  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];
  u32 w1[4];
  w1[0] = 0;
  w1[1] = 0;
  w1[2] = 0;
  w1[3] = 0;
  u32 w2[4];
  w2[0] = 0;
  w2[1] = 0;
  w2[2] = 0;
  w2[3] = 0;
  u32 w3[4];
  w3[0] = 0;
  w3[1] = 0;
  w3[2] = 0;
  w3[3] = 0;
  const u32 pw_len = pws[gid].pw_len & 63;
  /**
   * main
   */
  m33600s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
}
/**
 * Entry point: brute force (-a 3), single-hash, passwords of 5-8 u32
 * words. Loads the fixed words and delegates to m33600s.
 */
KERNEL_FQ KERNEL_FA void m33600_s08 (KERN_ATTR_BASIC ())
{
  /**
   * base
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  const u64 lsz = get_local_size (0);
  if (gid >= GID_CNT) return;
  u32 w0[4];
  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];
  u32 w1[4];
  w1[0] = pws[gid].i[ 4];
  w1[1] = pws[gid].i[ 5];
  w1[2] = pws[gid].i[ 6];
  w1[3] = pws[gid].i[ 7];
  u32 w2[4];
  w2[0] = 0;
  w2[1] = 0;
  w2[2] = 0;
  w2[3] = 0;
  u32 w3[4];
  w3[0] = 0;
  w3[1] = 0;
  w3[2] = 0;
  w3[3] = 0;
  const u32 pw_len = pws[gid].pw_len & 63;
  /**
   * main
   */
  m33600s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
}
/**
 * Entry point: brute force (-a 3), single-hash, passwords of 9-14 u32
 * words (w3[2]/w3[3] reserved for length/padding words). Delegates to
 * m33600s.
 */
KERNEL_FQ KERNEL_FA void m33600_s16 (KERN_ATTR_BASIC ())
{
  /**
   * base
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  const u64 lsz = get_local_size (0);
  if (gid >= GID_CNT) return;
  u32 w0[4];
  w0[0] = pws[gid].i[ 0];
  w0[1] = pws[gid].i[ 1];
  w0[2] = pws[gid].i[ 2];
  w0[3] = pws[gid].i[ 3];
  u32 w1[4];
  w1[0] = pws[gid].i[ 4];
  w1[1] = pws[gid].i[ 5];
  w1[2] = pws[gid].i[ 6];
  w1[3] = pws[gid].i[ 7];
  u32 w2[4];
  w2[0] = pws[gid].i[ 8];
  w2[1] = pws[gid].i[ 9];
  w2[2] = pws[gid].i[10];
  w2[3] = pws[gid].i[11];
  u32 w3[4];
  w3[0] = pws[gid].i[12];
  w3[1] = pws[gid].i[13];
  w3[2] = 0;
  w3[3] = 0;
  const u32 pw_len = pws[gid].pw_len & 63;
  /**
   * main
   */
  m33600s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
}

138
OpenCL/m33600_a3-pure.cl Normal file
View File

@ -0,0 +1,138 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * RIPEMD-320 (-m 33600), brute-force attack (-a 3), pure vector kernel,
 * multi-hash path. Only word 0 of the candidate varies per iteration
 * (from words_buf_r); the rest of the password buffer is fixed.
 */
KERNEL_FQ KERNEL_FA void m33600_mxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * base
   */
  const u32 pw_len = pws[gid].pw_len;
  u32x w[64] = { 0 }; // up to 256 bytes of password material
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }
  /**
   * loop
   */
  u32x w0l = w[0]; // fixed part of word 0
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; // varying part
    const u32x w0 = w0l | w0r;
    w[0] = w0;
    ripemd320_ctx_vector_t ctx;
    ripemd320_init_vector (&ctx);
    ripemd320_update_vector (&ctx, w, pw_len);
    ripemd320_final_vector (&ctx);
    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];
    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}
/**
 * RIPEMD-320 (-m 33600), brute-force attack (-a 3), pure vector kernel,
 * single-hash path. Same scheme as m33600_mxx, matched against the one
 * target digest via COMPARE_S_SIMD.
 */
KERNEL_FQ KERNEL_FA void m33600_sxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * base
   */
  const u32 pw_len = pws[gid].pw_len;
  u32x w[64] = { 0 };
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }
  /**
   * loop
   */
  u32x w0l = w[0];
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
    const u32x w0 = w0l | w0r;
    w[0] = w0;
    ripemd320_ctx_vector_t ctx;
    ripemd320_init_vector (&ctx);
    ripemd320_update_vector (&ctx, w, pw_len);
    ripemd320_final_vector (&ctx);
    const u32x r0 = ctx.h[DGST_R0];
    const u32x r1 = ctx.h[DGST_R1];
    const u32x r2 = ctx.h[DGST_R2];
    const u32x r3 = ctx.h[DGST_R3];
    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}

135
OpenCL/m33650_a0-pure.cl Normal file
View File

@ -0,0 +1,135 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_rp.h)
#include M2S(INCLUDE_PATH/inc_rp.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * HMAC-RIPEMD-320, key = password (-m 33650), rules attack (-a 0), pure
 * kernel, multi-hash path. For each rule: HMAC is keyed with the mutated
 * password, the salt is the message.
 */
KERNEL_FQ KERNEL_FA void m33650_mxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * base
   */
  COPY_PW (pws[gid]);
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32 s[64] = { 0 }; // salt copied to private memory once
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;
    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
    ripemd320_hmac_ctx_t ctx;
    ripemd320_hmac_init (&ctx, tmp.i, tmp.pw_len); // key = candidate
    ripemd320_hmac_update (&ctx, s, salt_len);     // message = salt
    ripemd320_hmac_final (&ctx);
    // final HMAC digest lives in the opad context
    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];
    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * HMAC-RIPEMD-320, key = password (-m 33650), rules attack (-a 0), pure
 * kernel, single-hash path. Same scheme as m33650_mxx with COMPARE_S.
 */
KERNEL_FQ KERNEL_FA void m33650_sxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * base
   */
  COPY_PW (pws[gid]);
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32 s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;
    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
    ripemd320_hmac_ctx_t ctx;
    ripemd320_hmac_init (&ctx, tmp.i, tmp.pw_len); // key = candidate
    ripemd320_hmac_update (&ctx, s, salt_len);     // message = salt
    ripemd320_hmac_final (&ctx);
    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];
    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

183
OpenCL/m33650_a1-pure.cl Normal file
View File

@ -0,0 +1,183 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * HMAC-RIPEMD-320, key = password (-m 33650), combinator attack (-a 1),
 * pure kernel, multi-hash path. The combinator word is shifted to the
 * base password's length and OR-merged; the concatenation keys the HMAC,
 * the salt is the message.
 */
KERNEL_FQ KERNEL_FA void m33650_mxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * base
   */
  const u32 pw_len = pws[gid].pw_len;
  u32 w[64] = { 0 }; // left/base password words
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32 s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    const u32 comb_len = combs_buf[il_pos].pw_len;
    u32 c[64];
    #ifdef _unroll
    #pragma unroll
    #endif
    for (int idx = 0; idx < 64; idx++)
    {
      c[idx] = combs_buf[il_pos].i[idx];
    }
    switch_buffer_by_offset_1x64_le_S (c, pw_len); // align right part after base
    #ifdef _unroll
    #pragma unroll
    #endif
    for (int i = 0; i < 64; i++)
    {
      c[i] |= w[i]; // merge base password into the shifted buffer
    }
    ripemd320_hmac_ctx_t ctx;
    ripemd320_hmac_init (&ctx, c, pw_len + comb_len); // key = concatenation
    ripemd320_hmac_update (&ctx, s, salt_len);        // message = salt
    ripemd320_hmac_final (&ctx);
    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];
    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * HMAC-RIPEMD-320, key = password (-m 33650), combinator attack (-a 1),
 * pure kernel, single-hash path. Same candidate construction as the
 * multi-hash variant, matched via COMPARE_S_SCALAR.
 */
KERNEL_FQ KERNEL_FA void m33650_sxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * base
   */
  const u32 pw_len = pws[gid].pw_len;
  u32 w[64] = { 0 };
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32 s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    const u32 comb_len = combs_buf[il_pos].pw_len;
    u32 c[64];
    #ifdef _unroll
    #pragma unroll
    #endif
    for (int idx = 0; idx < 64; idx++)
    {
      c[idx] = combs_buf[il_pos].i[idx];
    }
    switch_buffer_by_offset_1x64_le_S (c, pw_len); // shift right part into place
    #ifdef _unroll
    #pragma unroll
    #endif
    for (int i = 0; i < 64; i++)
    {
      c[i] |= w[i];
    }
    ripemd320_hmac_ctx_t ctx;
    ripemd320_hmac_init (&ctx, c, pw_len + comb_len);
    ripemd320_hmac_update (&ctx, s, salt_len);
    ripemd320_hmac_final (&ctx);
    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];
    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

155
OpenCL/m33650_a3-pure.cl Normal file
View File

@ -0,0 +1,155 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * HMAC-RIPEMD-320, key = password (-m 33650), brute-force attack (-a 3),
 * pure vector kernel, multi-hash path. Word 0 of the key varies per
 * iteration; the HMAC must be re-keyed each time since the key changes.
 */
KERNEL_FQ KERNEL_FA void m33650_mxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * base
   */
  const u32 pw_len = pws[gid].pw_len;
  u32x w[64] = { 0 };
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32x s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  /**
   * loop
   */
  u32x w0l = w[0];
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
    const u32x w0 = w0l | w0r;
    w[0] = w0;
    ripemd320_hmac_ctx_vector_t ctx;
    ripemd320_hmac_init_vector (&ctx, w, pw_len); // key = candidate
    ripemd320_hmac_update_vector (&ctx, s, salt_len);
    ripemd320_hmac_final_vector (&ctx);
    const u32x r0 = ctx.opad.h[DGST_R0];
    const u32x r1 = ctx.opad.h[DGST_R1];
    const u32x r2 = ctx.opad.h[DGST_R2];
    const u32x r3 = ctx.opad.h[DGST_R3];
    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}
/**
 * HMAC-RIPEMD-320, key = password (-m 33650), brute-force attack (-a 3),
 * pure vector kernel, single-hash path. Same scheme as the multi-hash
 * variant, matched via COMPARE_S_SIMD.
 */
KERNEL_FQ KERNEL_FA void m33650_sxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * base
   */
  const u32 pw_len = pws[gid].pw_len;
  u32x w[64] = { 0 };
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32x s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  /**
   * loop
   */
  u32x w0l = w[0];
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
    const u32x w0 = w0l | w0r;
    w[0] = w0;
    ripemd320_hmac_ctx_vector_t ctx;
    ripemd320_hmac_init_vector (&ctx, w, pw_len);
    ripemd320_hmac_update_vector (&ctx, s, salt_len);
    ripemd320_hmac_final_vector (&ctx);
    const u32x r0 = ctx.opad.h[DGST_R0];
    const u32x r1 = ctx.opad.h[DGST_R1];
    const u32x r2 = ctx.opad.h[DGST_R2];
    const u32x r3 = ctx.opad.h[DGST_R3];
    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}

139
OpenCL/m33660_a0-pure.cl Normal file
View File

@ -0,0 +1,139 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_rp.h)
#include M2S(INCLUDE_PATH/inc_rp.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * HMAC-RIPEMD-320, key = salt (-m 33660), rules attack (-a 0), pure
 * kernel, multi-hash path. The HMAC is keyed with the salt ONCE (ctx0);
 * each rule-mutated password is hashed as the message from that midstate.
 */
KERNEL_FQ KERNEL_FA void m33660_mxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * base
   */
  COPY_PW (pws[gid]);
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32 s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  // key the HMAC once with the salt; reused for every candidate
  ripemd320_hmac_ctx_t ctx0;
  ripemd320_hmac_init (&ctx0, s, salt_len);
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;
    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
    ripemd320_hmac_ctx_t ctx = ctx0; // restart from the keyed midstate
    ripemd320_hmac_update (&ctx, tmp.i, tmp.pw_len); // message = candidate
    ripemd320_hmac_final (&ctx);
    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];
    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * HMAC-RIPEMD-320, key = salt (-m 33660), rules attack (-a 0), pure
 * kernel, single-hash path. Same salt-keyed midstate reuse as the
 * multi-hash variant, matched via COMPARE_S_SCALAR.
 */
KERNEL_FQ KERNEL_FA void m33660_sxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * digest
   */
  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };
  /**
   * base
   */
  COPY_PW (pws[gid]);
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32 s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  ripemd320_hmac_ctx_t ctx0;
  ripemd320_hmac_init (&ctx0, s, salt_len); // key = salt, computed once
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    pw_t tmp = PASTE_PW;
    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);
    ripemd320_hmac_ctx_t ctx = ctx0;
    ripemd320_hmac_update (&ctx, tmp.i, tmp.pw_len);
    ripemd320_hmac_final (&ctx);
    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];
    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

187
OpenCL/m33660_a1-pure.cl Normal file
View File

@ -0,0 +1,187 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_scalar.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
/**
 * HMAC-RIPEMD-320, key = salt (-m 33660), combinator attack (-a 1), pure
 * kernel, multi-hash path. The HMAC is keyed with the salt once (ctx0);
 * each concatenated left+right candidate is the message.
 */
KERNEL_FQ KERNEL_FA void m33660_mxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */
  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);
  if (gid >= GID_CNT) return;
  /**
   * base
   */
  const u32 pw_len = pws[gid].pw_len;
  u32 w[64] = { 0 }; // left/base password words
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }
  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;
  u32 s[64] = { 0 };
  for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1)
  {
    s[idx] = salt_bufs[SALT_POS_HOST].salt_buf[idx];
  }
  ripemd320_hmac_ctx_t ctx0;
  ripemd320_hmac_init (&ctx0, s, salt_len); // key = salt, computed once
  /**
   * loop
   */
  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    const u32 comb_len = combs_buf[il_pos].pw_len;
    u32 c[64];
    #ifdef _unroll
    #pragma unroll
    #endif
    for (int idx = 0; idx < 64; idx++)
    {
      c[idx] = combs_buf[il_pos].i[idx];
    }
    switch_buffer_by_offset_1x64_le_S (c, pw_len); // align right part after base
    #ifdef _unroll
    #pragma unroll
    #endif
    for (int i = 0; i < 64; i++)
    {
      c[i] |= w[i]; // merge base password into the shifted buffer
    }
    ripemd320_hmac_ctx_t ctx = ctx0; // restart from the keyed midstate
    ripemd320_hmac_update (&ctx, c, pw_len + comb_len);
    ripemd320_hmac_final (&ctx);
    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];
    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
KERNEL_FQ KERNEL_FA void m33660_sxx (KERN_ATTR_BASIC ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * digest to search for (single-hash path)
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base: load the left (base) word and the salt into zero-padded word buffers
   */

  const u32 pw_len = pws[gid].pw_len;

  // number of whole 32-bit words the password occupies (rounded up)
  const u32 pw_lenv = (pw_len + 3) / 4;

  u32 w[64] = { 0 };

  for (u32 j = 0; j < pw_lenv; j++)
  {
    w[j] = pws[gid].i[j];
  }

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  const u32 salt_lenv = (salt_len + 3) / 4;

  u32 s[64] = { 0 };

  for (u32 j = 0; j < salt_lenv; j++)
  {
    s[j] = salt_bufs[SALT_POS_HOST].salt_buf[j];
  }

  // the salt is the HMAC key: derive ipad/opad once, reuse for all candidates
  ripemd320_hmac_ctx_t ctx0;

  ripemd320_hmac_init (&ctx0, s, salt_len);

  /**
   * loop over the right-hand (combinator) words
   */

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos++)
  {
    const u32 comb_len = combs_buf[il_pos].pw_len;

    u32 c[64];

    #ifdef _unroll
    #pragma unroll
    #endif
    for (int k = 0; k < 64; k++)
    {
      c[k] = combs_buf[il_pos].i[k];
    }

    // shift the right word past the base word, then merge both halves
    switch_buffer_by_offset_1x64_le_S (c, pw_len);

    #ifdef _unroll
    #pragma unroll
    #endif
    for (int k = 0; k < 64; k++)
    {
      c[k] |= w[k];
    }

    ripemd320_hmac_ctx_t ctx = ctx0;

    ripemd320_hmac_update (&ctx, c, pw_len + comb_len);

    ripemd320_hmac_final (&ctx);

    const u32 r0 = ctx.opad.h[DGST_R0];
    const u32 r1 = ctx.opad.h[DGST_R1];
    const u32 r2 = ctx.opad.h[DGST_R2];
    const u32 r3 = ctx.opad.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}

159
OpenCL/m33660_a3-pure.cl Normal file
View File

@ -0,0 +1,159 @@
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_ripemd320.cl)
#endif
KERNEL_FQ KERNEL_FA void m33660_mxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * base: load password and salt into zero-padded word buffers
   */

  const u32 pw_len = pws[gid].pw_len;

  // number of whole 32-bit words the password occupies (rounded up)
  const u32 pw_lenv = (pw_len + 3) / 4;

  u32x w[64] = { 0 };

  for (u32 j = 0; j < pw_lenv; j++)
  {
    w[j] = pws[gid].i[j];
  }

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  const u32 salt_lenv = (salt_len + 3) / 4;

  u32x s[64] = { 0 };

  for (u32 j = 0; j < salt_lenv; j++)
  {
    s[j] = salt_bufs[SALT_POS_HOST].salt_buf[j];
  }

  // the salt is the HMAC key: derive ipad/opad once, reuse for all candidates
  ripemd320_hmac_ctx_vector_t ctx0;

  ripemd320_hmac_init_vector (&ctx0, s, salt_len);

  /**
   * loop: only the first word varies between candidates (brute-force attack)
   */

  const u32x w0_base = w[0];

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    // patch the varying candidate bits into the first password word
    w[0] = w0_base | words_buf_r[il_pos / VECT_SIZE];

    ripemd320_hmac_ctx_vector_t ctx = ctx0;

    ripemd320_hmac_update_vector (&ctx, w, pw_len);

    ripemd320_hmac_final_vector (&ctx);

    const u32x r0 = ctx.opad.h[DGST_R0];
    const u32x r1 = ctx.opad.h[DGST_R1];
    const u32x r2 = ctx.opad.h[DGST_R2];
    const u32x r3 = ctx.opad.h[DGST_R3];

    COMPARE_M_SIMD (r0, r1, r2, r3);
  }
}
KERNEL_FQ KERNEL_FA void m33660_sxx (KERN_ATTR_VECTOR ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * digest to search for (single-hash path)
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]
  };

  /**
   * base: load password and salt into zero-padded word buffers
   */

  const u32 pw_len = pws[gid].pw_len;

  // number of whole 32-bit words the password occupies (rounded up)
  const u32 pw_lenv = (pw_len + 3) / 4;

  u32x w[64] = { 0 };

  for (u32 j = 0; j < pw_lenv; j++)
  {
    w[j] = pws[gid].i[j];
  }

  const u32 salt_len = salt_bufs[SALT_POS_HOST].salt_len;

  const u32 salt_lenv = (salt_len + 3) / 4;

  u32x s[64] = { 0 };

  for (u32 j = 0; j < salt_lenv; j++)
  {
    s[j] = salt_bufs[SALT_POS_HOST].salt_buf[j];
  }

  // the salt is the HMAC key: derive ipad/opad once, reuse for all candidates
  ripemd320_hmac_ctx_vector_t ctx0;

  ripemd320_hmac_init_vector (&ctx0, s, salt_len);

  /**
   * loop: only the first word varies between candidates (brute-force attack)
   */

  const u32x w0_base = w[0];

  for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
  {
    // patch the varying candidate bits into the first password word
    w[0] = w0_base | words_buf_r[il_pos / VECT_SIZE];

    ripemd320_hmac_ctx_vector_t ctx = ctx0;

    ripemd320_hmac_update_vector (&ctx, w, pw_len);

    ripemd320_hmac_final_vector (&ctx);

    const u32x r0 = ctx.opad.h[DGST_R0];
    const u32x r1 = ctx.opad.h[DGST_R1];
    const u32x r2 = ctx.opad.h[DGST_R2];
    const u32x r3 = ctx.opad.h[DGST_R3];

    COMPARE_S_SIMD (r0, r1, r2, r3);
  }
}

151
OpenCL/m34000-pure.cl Normal file
View File

@ -0,0 +1,151 @@
/**
* Author......: Netherlands Forensic Institute
* License.....: MIT
*/
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_hash_blake2b.cl)
#include M2S(INCLUDE_PATH/inc_hash_argon2.cl)
#endif
#define COMPARE_S M2S(INCLUDE_PATH/inc_comp_single.cl)
#define COMPARE_M M2S(INCLUDE_PATH/inc_comp_multi.cl)
// Per-candidate tmps structure required by the TMPS kernel interface.
// NOTE(review): the Argon2 working memory lives in the d_extra*_buf global
// buffers, and none of the m34000 kernels visible here read or write this
// state — presumably a minimal placeholder; confirm against the module.
typedef struct argon2_tmp
{
  u32 state[4];
} argon2_tmp_t;
KERNEL_FQ KERNEL_FA void m34000_init (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t))
{
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  // work items are striped across the four scratch buffers: item gid uses
  // buffer (gid % 4) at slot (gid / 4) within that buffer
  const u32 gd4 = gid / 4;
  const u32 gm4 = gid % 4;

  GLOBAL_AS void *V = d_extra0_buf;

  if (gm4 == 1) V = d_extra1_buf;
  if (gm4 == 2) V = d_extra2_buf;
  if (gm4 == 3) V = d_extra3_buf;

  const argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST];

  GLOBAL_AS argon2_block_t *argon2_block = get_argon2_block (&options, V, gd4);

  // prepare the initial Argon2 blocks for this password/salt pair
  argon2_init (&pws[gid], &salt_bufs[SALT_POS_HOST], &options, argon2_block);
}
// Argon2 memory-filling loop kernel. One work group processes one candidate
// (indexed by bid); the group's dimension-0 threads cooperate inside a
// segment, and dimension-1 threads are spread across Argon2 lanes.
KERNEL_FQ KERNEL_FA void m34000_loop (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t))
{
  const u64 gid = get_global_id (0);
  const u64 bid = get_group_id (0);
  const u64 lid = get_local_id (1);
  const u64 lsz = get_local_size (1);
  // candidates are indexed per work group here, not per work item
  if (bid >= GID_CNT) return;
  const u32 argon2_thread = get_local_id (0);
  const u32 argon2_lsz = get_local_size (0);
  // one 32-entry u64 shuffle scratch row per lane-slot, kept in local memory;
  // row count is fixed by ARGON2_PARALLELISM when that is compile-time known
  #ifdef ARGON2_PARALLELISM
  LOCAL_VK u64 shuffle_bufs[ARGON2_PARALLELISM][32];
  #else
  LOCAL_VK u64 shuffle_bufs[32][32];
  #endif
  LOCAL_AS u64 *shuffle_buf = shuffle_bufs[lid];
  SYNC_THREADS();
  // candidates are striped across the four scratch buffers: candidate bid
  // uses buffer (bid % 4) at slot (bid / 4) within that buffer
  const u32 bd4 = bid / 4;
  const u32 bm4 = bid % 4;
  GLOBAL_AS void *V;
  switch (bm4)
  {
    case 0: V = d_extra0_buf; break;
    case 1: V = d_extra1_buf; break;
    case 2: V = d_extra2_buf; break;
    case 3: V = d_extra3_buf; break;
  }
  argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST_BID];
  // override with the compile-time lane count when it is baked into the kernel
  #ifdef ARGON2_PARALLELISM
  options.parallelism = ARGON2_PARALLELISM;
  #endif
  GLOBAL_AS argon2_block_t *argon2_block = get_argon2_block (&options, V, bd4);
  // resume at the pass/slice this loop chunk starts from: hashcat splits the
  // full iteration count into LOOP_CNT-sized chunks, LOOP_POS deep so far
  argon2_pos_t pos;
  pos.pass = (LOOP_POS / ARGON2_SYNC_POINTS);
  pos.slice = (LOOP_POS % ARGON2_SYNC_POINTS);
  for (u32 i = 0; i < LOOP_CNT; i++)
  {
    // each dimension-1 thread fills the current segment of one or more lanes
    for (pos.lane = lid; pos.lane < options.parallelism; pos.lane += lsz)
    {
      argon2_fill_segment (argon2_block, &options, &pos, shuffle_buf, argon2_thread, argon2_lsz);
    }
    // all lanes must finish a slice before any thread may start the next one
    SYNC_THREADS ();
    pos.slice++;
    if (pos.slice == ARGON2_SYNC_POINTS)
    {
      pos.slice = 0;
      pos.pass++;
    }
  }
}
KERNEL_FQ KERNEL_FA void m34000_comp (KERN_ATTR_TMPS_ESALT (argon2_tmp_t, argon2_options_t))
{
  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  // work items are striped across the four scratch buffers: item gid uses
  // buffer (gid % 4) at slot (gid / 4) within that buffer
  const u32 gd4 = gid / 4;
  const u32 gm4 = gid % 4;

  GLOBAL_AS void *V = d_extra0_buf;

  if (gm4 == 1) V = d_extra1_buf;
  if (gm4 == 2) V = d_extra2_buf;
  if (gm4 == 3) V = d_extra3_buf;

  argon2_options_t options = esalt_bufs[DIGESTS_OFFSET_HOST];

  GLOBAL_AS argon2_block_t *argon2_block = get_argon2_block (&options, V, gd4);

  // finalize: fold the filled memory into the output tag words
  u32 out[8];

  argon2_final (argon2_block, &options, out);

  const u32 r0 = out[0];
  const u32 r1 = out[1];
  const u32 r2 = out[2];
  const u32 r3 = out[3];

  #define il_pos 0

  #include COMPARE_M
}

View File

@ -6,13 +6,14 @@ import sys
def extract_salts(salts_buf) -> list:
salts=[]
for salt_buf, salt_buf_pc, salt_len, salt_len_pc, salt_iter, salt_iter2, salt_sign, salt_repeats, orig_pos, digests_cnt, digests_done, digests_offset, scrypt_N, scrypt_r, scrypt_p in struct.iter_unpack("256s 256s I I I I 8s I I I I I I I I", salts_buf):
for salt_buf, salt_buf_pc, salt_len, salt_len_pc, salt_iter, salt_iter2, salt_dimy, salt_sign, salt_repeats, orig_pos, digests_cnt, digests_done, digests_offset, scrypt_N, scrypt_r, scrypt_p in struct.iter_unpack("256s 256s I I I I I 8s I I I I I I I I", salts_buf):
salt_buf = salt_buf[0:salt_len]
salt_buf_pc = salt_buf_pc[0:salt_len_pc]
salts.append({ "salt_buf": salt_buf, \
"salt_buf_pc": salt_buf_pc, \
"salt_iter": salt_iter, \
"salt_iter2": salt_iter2, \
"salt_dimy": salt_dimy, \
"salt_sign": salt_sign, \
"salt_repeats": salt_repeats, \
"orig_pos": orig_pos, \

View File

@ -27,6 +27,8 @@
- Added hash-mode: GPG (AES-128/AES-256 (SHA-256($pass)))
- Added hash-mode: GPG (AES-128/AES-256 (SHA-512($pass)))
- Added hash-mode: GPG (CAST5 (SHA-1($pass)))
- Added hash-mode: HMAC-RIPEMD320 (key = $pass)
- Added hash-mode: HMAC-RIPEMD320 (key = $salt)
- Added hash-mode: Kerberos 5, etype 17, AS-REP
- Added hash-mode: Kerberos 5, etype 18, AS-REP
- Added hash-mode: MetaMask Mobile Wallet
@ -40,6 +42,7 @@
- Added hash-mode: NetIQ SSPR (SHA-1 with Salt)
- Added hash-mode: NetIQ SSPR (SHA-256 with Salt)
- Added hash-mode: NetIQ SSPR (SHA-512 with Salt)
- Added hash-mode: RIPEMD-320
- Added hash-mode: RC4 104-bit DropN
- Added hash-mode: RC4 40-bit DropN
- Added hash-mode: RC4 72-bit DropN
@ -49,6 +52,7 @@
- Added hash-mode: bcrypt(sha256($pass))
- Added hash-mode: HMAC-RIPEMD160 (key = $pass)
- Added hash-mode: HMAC-RIPEMD160 (key = $salt)
- Added hash-mode: md5($salt1.$pass.$salt2)
- Added hash-mode: md5($salt1.sha1($salt2.$pass))
- Added hash-mode: md5(md5($pass.$salt))
- Added hash-mode: md5(md5($salt).md5(md5($pass)))
@ -60,6 +64,7 @@
- Added hash-mode: md5(sha1($salt.$pass))
- Added hash-mode: sha512(sha512($pass).$salt)
- Added hash-mode: sha512(sha512_bin($pass).$salt)
- Added hash-mode: md5($salt.md5($pass).$salt)
##
## Features
@ -67,7 +72,9 @@
- Added new feature (-Y) that creates N virtual instances for each device in your system at the cost of N times the device memory consumption
- Added options --benchmark-min and --benchmark-max to set a hash-mode range to be used during the benchmark
- Added option --total-candidates to provide the total candidate count for an attack instead of the internal "--keyspace" value
- Added option --backend-devices-keepfree to configure X percentage of device memory available to keep free
- Added display of password length minimum and maximum in the Kernel.Feature status line
##
## Performance
@ -89,6 +96,9 @@
- Fixed bug in grep out-of-memory workaround on Unit Test
- Fixed bug in input_tokenizer when TOKEN_ATTR_FIXED_LENGTH is used and refactor modules
- Fixed bug in --stdout that caused certain rules to malfunction
- Fixed bug in --stdout when multiple computing devices are active
- Fixed bug in Hardware Monitor: prevent disable if ADL fail
- Fixed race condition in selftest_init on OpenCL with non-blocking write
- Fixed build failed for 10700 optimized with Apple Metal
- Fixed build failed for 13772 and 13773 with Apple Metal
- Fixed build failed for 18400 with Apple Metal
@ -123,8 +133,11 @@
- Alias Devices: Prevents hashcat, when started with x86_64 emulation on Apple Silicon, from showing the Apple M1 OpenCL CPU as an alias for the Apple M1 Metal GPU
- Apple Driver: Automatically enable GPU support on Apple OpenCL instead of CPU support
- Apple Driver: Updated requirements to use Apple OpenCL API to macOS 13.0 - use
- Backend: Added workaround to get rid of internal runtimes memory leaks
- Backend: Updated filename chksum format to prevent invalid cache on Apple Silicon when switching arch
- Backend: Splitting backend_ctx_devices_init into smaller runtime-specific functions
- Backend Checks: Describe workaround in error message when detecting more than 64 backend devices
- Backend Info: Added --machine-readable format
- Brain: Added sanity check and corresponding error message for invalid --brain-port values
- Dependencies: Added sse2neon v1.8.0 (commit 658eeac)
- Dependencies: Updated LZMA SDK to 24.09
@ -132,26 +145,38 @@
- Dependencies: Updated xxHash to 0.8.3 (commit 50f4226)
- Building: Support building windows binaries on macOS using MinGW
- Dependencies: Updated OpenCL-Headers to v2024.10.24 (commit 265df85)
- Documents: Renamed status_code.txt to exit_status_code.txt and added device_status_code.txt
- Documents: Updated BUILD.md and added BUILD_macOS.md (containing instructions for building windows binaries on macOS)
- Modules: Added module_unstable_warning for 22500, update module_unstable_warning for 10700
- HIP Backend: Avoid deprecated functions
- Modules: Added support for non-zero IVs for -m 6800 (Lastpass). Also added `tools/lastpass2hashcat.py`
- Modules: Updated module_unstable_warning
- Open Document Format: Added support for small documents with content length < 1024
- OpenCL Backend: added workaround to set device_available_memory from CUDA/HIP alias device
- Selftest: rename selftest function to process_selftest and splitting into 3 smaller functions
- Status Code: Add specific return code for self-test fail (-11)
- Scrypt: Increase buffer sizes in module for hash mode 8900 to allow longer scrypt digests
- Unicode: Update UTF-8 to UTF-16 conversion to match RFC 3629
- Unit tests: Updated install_modules.sh with Crypt::Argon2
- User Options: Added error message when mixing --username and --show to warn users of exponential delay
- MetaMask: update extraction tool to support MetaMask Mobile wallets
- SecureCRT MasterPassphrase v2: update module, pure kernels and test unit. Add optimized kernels.
- Metal Backend: added workaround to prevent 'Infinite Loop' bug when build kernels
- Metal Backend: added workaround to set the true Processor value in Metal devices on Apple Intel
- Metal Backend: added support to 2D/3D Compute
- Metal Backend: allow use of devices with Metal if runtime version is >= 200
- Metal Backend: disable Metal devices only if at least one OpenCL device is active
- Metal Backend: improved compute workloads calculation
- Modules: Check UnpackSize to raise false positive with hc_decompress_rar
- User Options: added --metal-compiler-runtime option
- User Options: limit --bitmap-max value to 31
- User Options: assigned -H to --hash-info
- Hash-Info: show more details using -HH
- Hardware Monitor: avoid sprintf in src/ext_iokit.c
- Hardware Monitor: Splitting hwmon_ctx_init function into smaller library-specific functions
- Help: show supported hash-modes only with -hh
- Makefile: prevent make failure with Apple Silicon in case of partial rebuild
- Makefile: updated MACOSX_DEPLOYMENT_TARGET to 15.0
- Rules: Add support to character class rules
- Rules: Rename best64.rule to best66.rule and remove the unknown section from it

View File

@ -0,0 +1,20 @@
Device Status Codes:
====================
0 = "Initializing"
1 = "Autotuning"
2 = "Selftest"
3 = "Running"
4 = "Paused"
5 = "Exhausted"
6 = "Cracked"
7 = "Aborted"
8 = "Quit"
9 = "Bypass"
10 = "Aborted (Checkpoint)"
11 = "Aborted (Runtime)"
12 = "Running (Checkpoint Quit requested)"
13 = "Error"
14 = "Aborted (Finish)"
15 = "Running (Quit after attack requested)"
16 = "Autodetect"

View File

@ -23,7 +23,7 @@ static const u64 BRIDGE_TYPE = BRIDGE_TYPE_MATCH_TUNINGS
static const char *BRIDGE_NAME = "scrypt_jane";
```
* `BRIDGE_NAME` tells Hashcat which bridge to load (e.g., `bridge_scrypt_jane.so`).
* `BRIDGE_NAME` tells hashcat which bridge to load (e.g., `bridge_scrypt_jane.so`).
* `BRIDGE_TYPE` indicates which backend kernel functions the bridge will override:
* `BRIDGE_TYPE_LAUNCH_LOOP`: Entry point for all bridges that register to run after `RUN_LOOP`
@ -31,7 +31,7 @@ static const char *BRIDGE_NAME = "scrypt_jane";
* `BRIDGE_TYPE_REPLACE_LOOP`: Same as BRIDGE_TYPE_LAUNCH_LOOP, but deactivates `RUN_LOOP`
* `BRIDGE_TYPE_REPLACE_LOOP2`: Same as BRIDGE_TYPE_LAUNCH_LOOP2, but deactivates `RUN_LOOP2`
Hashcat loads the bridge dynamically and uses it for any declared invocation.
hashcat loads the bridge dynamically and uses it for any declared invocation.
Note that bridges only load for outside kernel, aka "slow hash" kernels. In "fast hash" kernels, such as MD5, they are ignored. In case you want to implement a "fast hash" + bridge hybrid, you can move the "fast hash" code into a new "slow hash" kernel.
@ -50,7 +50,7 @@ ATTACK_EXEC_OUTSIDE_KERNEL:
RUN_PREPARE
ITER_REPEATS:
RUN_LOOP
RUN_EXTENTED
RUN_EXTENDED
COPY_BRIDGE_MATERIAL_TO_HOST
BRIDGE_LAUNCH_LOOP
COPY_BRIDGE_MATERIAL_TO_DEVICE
@ -75,16 +75,16 @@ ATTACK_EXEC_OUTSIDE_KERNEL:
- COPY_* refers to host-to-device or device-to-host copies and typically involve PCIe data transfer.
- CALL_* are code functions executed on the host CPU. They are plugin-specific and defined in a module. They were the predecessor of bridges but are still usable.
- SALT_* typically are optional steps which allow certain algorithms specific optimizations. For instance in Scrypt with P > 1, the V and XY buffer can be reused and allow temporary storage of result values into B. This saves memory requirement, improving parallelization
- ITER_* is the main loop that chunks what typically is defined as "iterations" in a algorithm computation. For instance a PBKDF2 function is called with 10,000 iterations, which would take a while to compute. The time this takes could be longer than a GPU drivers watchdog allows (before it resets the compute engine.). Hashcat will divide the 10,000 into chunks of let's say 1,000 and call the same kernel 10 times
- ITER_* is the main loop that chunks what typically is defined as "iterations" in a algorithm computation. For instance a PBKDF2 function is called with 10,000 iterations, which would take a while to compute. The time this takes could be longer than a GPU drivers watchdog allows (before it resets the compute engine.). hashcat will divide the 10,000 into chunks of let's say 1,000 and call the same kernel 10 times
- BRIDGE_* existing bridge entry points. During the "lifetime" of a hash computation the tmps[] variable is used (algorithm specific, so defined in the specific plugin module and kernel). This variable is which we refer to as bridge material, but it's possible we add other types of variables to "material" in the future
- ITER2/LOOP2: Optional entry points in case the algorithm consists of two types of long running (high iterated) sub-components. For instance one iteration of 10k loops sha256 followed by 100k loops of sha512, or bcrypt followed by scrypt
* `BRIDGE_TYPE_LAUNCH_INIT`
* `BRIDGE_TYPE_LAUNCH_COMP`
Hashcat devs will add support on request.
hashcat devs will add support on request.
As mentioned in the BRIDGE_* entry points, it's the developer's responsibility to ensure compatibility. That typically means the handling of the `tmps` variable relevant in the `kernel_loop` and how it changes over algorithm computations lifetime. Hashcat will take care of copying the data from and to the compute backend buffers (bridge material).
As mentioned in the BRIDGE_* entry points, it's the developer's responsibility to ensure compatibility. That typically means the handling of the `tmps` variable relevant in the `kernel_loop` and how it changes over algorithm computations lifetime. hashcat will take care of copying the data from and to the compute backend buffers (bridge material).
But the bridge developer must ensure data transformation compatibility. For instance, if we replace the loop section in SCRYPT (8900), the long running part is the smix() activity. But SCRYPT implements the PBKDF2 handling in both init and comp kernels, preparing the values in B[] after the init kernel, and expecting modified values in B[] before running comp kernel. If you want to replace the smix() section with let's say FPGA code, the bridge needs to understand the structure of the tmps[] variable. In this case tmps[] just reflect SCRYPT B[], making this simple, but other algorithms may require more than just one large buffer array. That means the structure itself (datatypes), but also the amount of workitems, because there's almost always more than one workitem (to reduce overhead times).
@ -95,7 +95,7 @@ There's some more BRIDGE PARAMETERs that you should know:
## How Bridges Work
When Hashcat starts with a plugin that specifies a bridge, it loads the bridge and invokes its initialization function. The bridge must then discover its internal compute units, called *bridge units*. Handling the units must be implemented by the bridge developer, and typically involves loading some library, init it, and retrieve some resources available, for instances loading XRT, asking how many FPGA are available. If there's two FPGA, then the bridge unit count would be two. You also need to provide some detailed information on the unit itself, for instance the name of the device, or version or your software solution if it's not a hardware.
When hashcat starts with a plugin that specifies a bridge, it loads the bridge and invokes its initialization function. The bridge must then discover its internal compute units, called *bridge units*. Handling the units must be implemented by the bridge developer, and typically involves loading some library, init it, and retrieve some resources available, for instances loading XRT, asking how many FPGA are available. If there's two FPGA, then the bridge unit count would be two. You also need to provide some detailed information on the unit itself, for instance the name of the device, or version or your software solution if it's not a hardware.
Each of these bridge unit maps to one virtual backend device, which allows asynchronous and independent parallel execution, and this were virtual backend devices become relevant. Read section about virtual backend devices for a better understanding
@ -110,7 +110,7 @@ From the bridge_init() function you have access to the following generic paramet
## Virtual Backend Devices
This feature is available also outside of bridges, eg in order to increase some workload on a compute device, but it was added in the first place to support bridges. The main problem is that it's possible that a bridge return 2 bridge units which may have different speeds (clocking), or an ideal batch size. The time it takes to compute a certain batch of passwords would be different, so there was a need for an asynchronous execution strategy. Hashcat supports mixed speed device types, but that typically mean "backend" devices. To solve the issue, we partition (virtualize) one physical backend device into multiple virtual backend devices (done internally by hashcat), and "link" each of the virtual backend device to a bridge unit. Due to this binding we can support bridge units of different speed. There's two flags a user can control in regard to virtual device backend:
This feature is available also outside of bridges, eg in order to increase some workload on a compute device, but it was added in the first place to support bridges. The main problem is that it's possible that a bridge return 2 bridge units which may have different speeds (clocking), or an ideal batch size. The time it takes to compute a certain batch of passwords would be different, so there was a need for an asynchronous execution strategy. hashcat supports mixed speed device types, but that typically mean "backend" devices. To solve the issue, we partition (virtualize) one physical backend device into multiple virtual backend devices (done internally by hashcat), and "link" each of the virtual backend device to a bridge unit. Due to this binding we can support bridge units of different speed. There's two flags a user can control in regard to virtual device backend:
* Use `-Y` to define how many virtual backend devices to create.
* Use `-R` to bind these virtual devices to a physical backend host (new in v7).

View File

@ -70,4 +70,4 @@ Depending on interface compatibility, code from other password cracking tools (e
The Assimilation Bridge introduces a highly extensible mechanism to integrate custom compute resources and logic into Hashcat.
For hands-on examples and developer guidance, refer to the accompanying documentation in `docs/hashcat-assimiliation-bridge-development.md` (first draft).
For hands-on examples and developer guidance, refer to the accompanying documentation in `docs/hashcat-assimilation-bridge-development.md` (first draft).

View File

@ -210,13 +210,13 @@ Notes:
If you modify one of these plugin files, there's a trade-off: you wont be able to contribute that code directly to the upstream Hashcat repository, since those files are meant to remain clean for demonstration purposes.
To address this, the assimilation bridge provides a generic parameter that users can specify via the command line. In the case of the Python bridge, only the first parameter is used. You can override the Python script to be loaded using `--bridge-parameter1`:
To address this, the assimilation bridge provides a generic parameter that users can specify via the command line. In the case of the Python bridge, only the first parameter is used. Using `--bridge-parameter1` allows you to override the Python script to be loaded:
```
$ ./hashcat -m 73000 --bridge-parameter1 myimplementation.py hash.txt wordlist.txt ...
$ ./hashcat -m 73000 --bridge-parameter1 ./Python/myimplementation.py hash.txt wordlist.txt ...
```
This tells the Python bridge plugin to load `myimplementation.py` instead of the default `generic_hash_mp.py`. This approach is especially useful if you plan to contribute `myimplementation.py` to the upstream Hashcat repository. If you choose to stay within the generic mode, your Python code wont have a dedicated hash mode, and you'll need to instruct users to use the `--bridge-parameter1` flag to load your implementation.
This tells the Python bridge plugin to load `myimplementation.py` located in the local `Python` subdirectory instead of the default `generic_hash_mp.py`. This approach is especially useful if you plan to contribute `myimplementation.py` to the upstream Hashcat repository. If you choose to stay within the generic mode, your Python code wont have a dedicated hash mode, and you'll need to instruct users to use the `--bridge-parameter1` flag to load your implementation.
### Design Tradeoffs and Format Considerations

View File

@ -57,6 +57,7 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or
- SHA3-384
- SHA3-512
- RIPEMD-160
- RIPEMD-320
- BLAKE2b-512
- BLAKE2s-256
- SM3
@ -86,11 +87,13 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or
- md5($salt.md5($salt.$pass))
- md5($salt.sha1($salt.$pass))
- md5($salt.utf16le($pass))
- md5($salt1.$pass.$salt2)
- md5($salt1.sha1($salt2.$pass))
- md5($salt1.strtoupper(md5($salt2.$pass)))
- md5(md5($pass))
- md5(md5($pass).md5($salt))
- md5(md5($pass.$salt))
- md5($salt.md5($pass).$salt)
- md5(md5(md5($pass)))
- md5(md5(md5($pass)).$salt)
- md5(md5(md5($pass).$salt1).$salt2)
@ -143,6 +146,8 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or
- HMAC-MD5 (key = $salt)
- HMAC-RIPEMD160 (key = $pass)
- HMAC-RIPEMD160 (key = $salt)
- HMAC-RIPEMD320 (key = $pass)
- HMAC-RIPEMD320 (key = $salt)
- HMAC-SHA1 (key = $pass)
- HMAC-SHA1 (key = $salt)
- HMAC-SHA256 (key = $pass)

View File

@ -425,7 +425,7 @@ _hashcat ()
local HIDDEN_FILES_AGGRESSIVE="${HIDDEN_FILES}|hcmask|hcchr"
local BUILD_IN_CHARSETS='?l ?u ?d ?a ?b ?s ?h ?H'
local SHORT_OPTS="-m -a -V -h -b -t -T -o -p -c -d -D -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z -M -Y -R"
local SHORT_OPTS="-m -a -V -h -H -b -t -T -o -p -c -d -D -w -n -u -j -k -r -g -1 -2 -3 -4 -i -I -s -l -O -S -z -M -Y -R"
local LONG_OPTS="--hash-type --attack-mode --version --help --quiet --benchmark --benchmark-all --hex-salt --hex-wordlist --hex-charset --force --status --status-json --status-timer --stdin-timeout-abort --machine-readable --loopback --markov-hcstat2 --markov-disable --markov-inverse --markov-classic --markov-threshold --runtime --session --speed-only --progress-only --restore --restore-file-path --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-json --outfile-check-timer --outfile-check-dir --wordlist-autohex-disable --separator --show --deprecated-check-disable --left --username --dynamic-x --remove --remove-timer --potfile-disable --potfile-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --example-hashes --hash-info --backend-ignore-cuda --backend-ignore-opencl --backend-ignore-hip --backend-ignore-metal --backend-info --backend-devices --backend-devices-virtmulti --backend-devices-virthost --backend-devices-keepfree --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-disable --hwmon-temp-abort --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-func-sel --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --stdout --keep-guessing --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --optimized-kernel-enable --multiply-accel-disable --self-test-disable --slow-candidates --brain-server --brain-server-timer --brain-client --brain-client-features --brain-host --brain-port --brain-session --brain-session-whitelist --brain-password --identify 
--bridge-parameter1 --bridge-parameter2 --bridge-parameter3 --bridge-parameter4"
local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --status-timer --stdin-timeout-abort --markov-hcstat2 --markov-threshold --runtime --session --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --potfile-path --restore-file-path --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --backend-devices --backend-devices-virtmulti --backend-devices-virthost --backend-devices-keepfree --opencl-device-types --backend-vector-width --workload-profile --kernel-accel --kernel-loops --kernel-threads --spin-damp --hwmon-temp-abort --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-func-sel --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --hook-threads --increment-min --increment-max --scrypt-tmto --keyboard-layout-mapping --truecrypt-keyfiles --veracrypt-keyfiles --veracrypt-pim-start --veracrypt-pim-stop --hccapx-message-pair --nonce-error-corrections --encoding-from --encoding-to --brain-server-timer --brain-client-features --brain-host --brain-password --brain-port --brain-session --brain-session-whitelist --bridge-parameter1 --bridge-parameter2 --bridge-parameter3 --bridge-parameter4"
@ -729,11 +729,11 @@ _hashcat ()
while [ ${h} -le ${COMP_CWORD} ]; do
if [[ "${COMP_WORDS[h]}" == "-a" ]]; then
if [[ "${COMP_WORDS[h]}" == "-a" ]]; then
attack_mode=${COMP_WORDS[$((h + 1))]}
elif [[ "${COMP_WORDS[h]}" == -a* ]]; then
elif [[ "${COMP_WORDS[h]}" == -a* ]]; then
attack_mode=${COMP_WORDS[h]:2}

View File

@ -51,7 +51,7 @@ typedef cl_int (CL_API_CALL *OCL_CLENQUEUEFILLBUFFER) (cl_comman
typedef cl_int (CL_API_CALL *OCL_CLENQUEUECOPYBUFFER) (cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *);
typedef void * (CL_API_CALL *OCL_CLENQUEUEMAPBUFFER) (cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUENDRANGEKERNEL) (cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUEREADBUFFER) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUEREADBUFFER) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUEUNMAPMEMOBJECT) (cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int (CL_API_CALL *OCL_CLENQUEUEWRITEBUFFER) (cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int (CL_API_CALL *OCL_CLFINISH) (cl_command_queue);

View File

@ -1154,14 +1154,19 @@ typedef CUresult (CUDA_API_CALL *CUDA_CUINIT) (unsigned int);
typedef CUresult (CUDA_API_CALL *CUDA_CULAUNCHKERNEL) (CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOC) (CUdeviceptr *, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMALLOCHOST) (void **, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOD) (CUdeviceptr, CUdeviceptr, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOH) (void *, CUdeviceptr, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTOD) (CUdeviceptr, const void *, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32) (CUdeviceptr, unsigned int, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8) (CUdeviceptr, unsigned char, size_t);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTODASYNC) (CUdeviceptr, CUdeviceptr, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYDTOHASYNC) (void *, CUdeviceptr, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMCPYHTODASYNC) (CUdeviceptr, const void *, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32ASYNC) (CUdeviceptr, unsigned int, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8ASYNC) (CUdeviceptr, unsigned char, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREE) (CUdeviceptr);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMFREEHOST) (void *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMGETINFO) (size_t *, size_t *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD32ASYNC) (CUdeviceptr, unsigned int, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMEMSETD8ASYNC) (CUdeviceptr, unsigned char, size_t, CUstream);
typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETFUNCTION) (CUfunction *, CUmodule, const char *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMODULEGETGLOBAL) (CUdeviceptr *, size_t *, CUmodule, const char *);
typedef CUresult (CUDA_API_CALL *CUDA_CUMODULELOAD) (CUmodule *, const char *);
@ -1217,14 +1222,19 @@ typedef struct hc_cuda_lib
CUDA_CULAUNCHKERNEL cuLaunchKernel;
CUDA_CUMEMALLOC cuMemAlloc;
CUDA_CUMEMALLOCHOST cuMemAllocHost;
CUDA_CUMEMCPYDTOD cuMemcpyDtoD;
CUDA_CUMEMCPYDTOH cuMemcpyDtoH;
CUDA_CUMEMCPYHTOD cuMemcpyHtoD;
CUDA_CUMEMSETD32 cuMemsetD32;
CUDA_CUMEMSETD8 cuMemsetD8;
CUDA_CUMEMCPYDTODASYNC cuMemcpyDtoDAsync;
CUDA_CUMEMCPYDTOHASYNC cuMemcpyDtoHAsync;
CUDA_CUMEMCPYHTODASYNC cuMemcpyHtoDAsync;
CUDA_CUMEMSETD32ASYNC cuMemsetD32Async;
CUDA_CUMEMSETD8ASYNC cuMemsetD8Async;
CUDA_CUMEMFREE cuMemFree;
CUDA_CUMEMFREEHOST cuMemFreeHost;
CUDA_CUMEMGETINFO cuMemGetInfo;
CUDA_CUMEMSETD32ASYNC cuMemsetD32Async;
CUDA_CUMEMSETD8ASYNC cuMemsetD8Async;
CUDA_CUMODULEGETFUNCTION cuModuleGetFunction;
CUDA_CUMODULEGETGLOBAL cuModuleGetGlobal;
CUDA_CUMODULELOAD cuModuleLoad;
@ -1272,13 +1282,18 @@ int hc_cuFuncSetAttribute (void *hashcat_ctx, CUfunction hfunc, CUfunction_
int hc_cuInit (void *hashcat_ctx, unsigned int Flags);
int hc_cuLaunchKernel (void *hashcat_ctx, CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
int hc_cuMemAlloc (void *hashcat_ctx, CUdeviceptr *dptr, size_t bytesize);
int hc_cuMemcpyDtoD (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyDtoH (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
int hc_cuMemcpyHtoD (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
int hc_cuMemsetD32 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N);
int hc_cuMemsetD8 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N);
int hc_cuMemcpyDtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
int hc_cuMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
int hc_cuMemcpyHtoDAsync (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr);
int hc_cuMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total);
int hc_cuMemsetD32Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
int hc_cuMemsetD8Async (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr);
int hc_cuMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total);
int hc_cuModuleGetFunction (void *hashcat_ctx, CUfunction *hfunc, CUmodule hmod, const char *name);
int hc_cuModuleGetGlobal (void *hashcat_ctx, CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
int hc_cuModuleLoadDataEx (void *hashcat_ctx, CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);

File diff suppressed because it is too large Load Diff

View File

@ -111,7 +111,7 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_devi
int hc_mtlEncodeComputeCommand_pre (void *hashcat_ctx, mtl_pipeline metal_pipeline, mtl_command_queue metal_command_queue, mtl_command_buffer *metal_command_buffer, mtl_command_encoder *metal_command_encoder);
int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, size_t off, size_t idx, mtl_mem buf, void *host_data, size_t host_data_size);
int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, size_t global_work_size, size_t local_work_size, double *ms);
int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms);
#endif // __APPLE__

View File

@ -33,6 +33,7 @@ bool overflow_check_u64_add (const u64 a, const u64 b);
bool overflow_check_u64_mul (const u64 a, const u64 b);
bool is_power_of_2 (const u32 v);
u32 smallest_repeat_double (const u32 v);
u32 get_random_num (const u32 min, const u32 max);

View File

@ -115,6 +115,7 @@ typedef enum event_identifier
EVENT_BRIDGES_SALT_POST = 0x00000122,
EVENT_BRIDGES_SALT_PRE = 0x00000123,
EVENT_CALCULATED_WORDS_BASE = 0x00000020,
EVENT_CALCULATED_WORDS_CNT = 0x00000021,
EVENT_CRACKER_FINISHED = 0x00000030,
EVENT_CRACKER_HASH_CRACKED = 0x00000031,
EVENT_CRACKER_STARTING = 0x00000032,
@ -424,6 +425,9 @@ typedef enum opti_type
OPTI_TYPE_REGISTER_LIMIT = (1 << 20), // We'll limit the register count to 128
OPTI_TYPE_SLOW_HASH_SIMD_INIT2 = (1 << 21),
OPTI_TYPE_SLOW_HASH_SIMD_LOOP2 = (1 << 22),
OPTI_TYPE_SLOW_HASH_DIMY_INIT = (1 << 23),
OPTI_TYPE_SLOW_HASH_DIMY_LOOP = (1 << 24),
OPTI_TYPE_SLOW_HASH_DIMY_COMP = (1 << 25),
} opti_type_t;
@ -488,14 +492,17 @@ typedef enum opts_type
OPTS_TYPE_DYNAMIC_SHARED = (1ULL << 53), // use dynamic shared memory (note: needs special kernel changes)
OPTS_TYPE_SELF_TEST_DISABLE = (1ULL << 54), // some algos use JiT in combinations with a salt or create too much startup time
OPTS_TYPE_MP_MULTI_DISABLE = (1ULL << 55), // do not multiply the kernel-accel with the multiprocessor count per device to allow more fine-tuned workload settings
OPTS_TYPE_NATIVE_THREADS = (1ULL << 56), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
OPTS_TYPE_MAXIMUM_THREADS = (1ULL << 57), // disable else branch in pre-compilation thread count optimization setting
OPTS_TYPE_POST_AMP_UTF16LE = (1ULL << 58), // run the utf8 to utf16le conversion kernel after they have been processed from amplifiers
OPTS_TYPE_THREAD_MULTI_DISABLE // do not multiply the kernel-power with the thread count per device for super slow algos
= (1ULL << 56),
OPTS_TYPE_NATIVE_THREADS = (1ULL << 57), // forces "native" thread count: CPU=1, GPU-Intel=8, GPU-AMD=64 (wavefront), GPU-NV=32 (warps)
OPTS_TYPE_MAXIMUM_THREADS = (1ULL << 58), // disable else branch in pre-compilation thread count optimization setting
OPTS_TYPE_POST_AMP_UTF16LE = (1ULL << 59), // run the utf8 to utf16le conversion kernel after they have been processed from amplifiers
OPTS_TYPE_AUTODETECT_DISABLE
= (1ULL << 59), // skip autodetect engine
OPTS_TYPE_STOCK_MODULE = (1ULL << 60), // module included with hashcat default distribution
= (1ULL << 60), // skip autodetect engine
OPTS_TYPE_STOCK_MODULE = (1ULL << 61), // module included with hashcat default distribution
OPTS_TYPE_MULTIHASH_DESPITE_ESALT
= (1ULL << 61), // overrule multihash cracking check same salt but not same esalt
= (1ULL << 62), // overrule multihash cracking check same salt but not same esalt
OPTS_TYPE_MAXIMUM_ACCEL = (1ULL << 63) // try to maximize kernel-accel during autotune
} opts_type_t;
@ -539,6 +546,7 @@ typedef enum dgst_size
DGST_SIZE_4_6 = (6 * sizeof (u32)), // 24
DGST_SIZE_4_7 = (7 * sizeof (u32)), // 28
DGST_SIZE_4_8 = (8 * sizeof (u32)), // 32
DGST_SIZE_4_10 = (10 * sizeof (u32)), // 40
DGST_SIZE_4_16 = (16 * sizeof (u32)), // 64 !!!
DGST_SIZE_4_32 = (32 * sizeof (u32)), // 128 !!!
DGST_SIZE_4_64 = (64 * sizeof (u32)), // 256
@ -710,7 +718,7 @@ typedef enum user_options_defaults
#else
HWMON_TEMP_ABORT = 90,
#endif
HASH_INFO = false,
HASH_INFO = 0,
HASH_MODE = 0,
HCCAPX_MESSAGE_PAIR = 0,
HEX_CHARSET = false,
@ -726,6 +734,7 @@ typedef enum user_options_defaults
KERNEL_LOOPS = 0,
KERNEL_THREADS = 0,
KEYSPACE = false,
TOTAL_CANDIDATES = false,
LEFT = false,
LIMIT = 0,
LOGFILE = true,
@ -827,13 +836,17 @@ typedef enum user_options_map
IDX_CUSTOM_CHARSET_2 = '2',
IDX_CUSTOM_CHARSET_3 = '3',
IDX_CUSTOM_CHARSET_4 = '4',
IDX_CUSTOM_CHARSET_5 = '5',
IDX_CUSTOM_CHARSET_6 = '6',
IDX_CUSTOM_CHARSET_7 = '7',
IDX_CUSTOM_CHARSET_8 = '8',
IDX_DEBUG_FILE = 0xff12,
IDX_DEBUG_MODE = 0xff13,
IDX_DEPRECATED_CHECK_DISABLE = 0xff14,
IDX_DYNAMIC_X = 0xff55,
IDX_ENCODING_FROM = 0xff15,
IDX_ENCODING_TO = 0xff16,
IDX_HASH_INFO = 0xff17,
IDX_HASH_INFO = 'H', // 0xff17
IDX_FORCE = 0xff18,
IDX_HWMON_DISABLE = 0xff19,
IDX_HWMON_TEMP_ABORT = 0xff1a,
@ -909,6 +922,7 @@ typedef enum user_options_map
IDX_STATUS_TIMER = 0xff4c,
IDX_STDOUT_FLAG = 0xff4d,
IDX_STDIN_TIMEOUT_ABORT = 0xff4e,
IDX_TOTAL_CANDIDATES = 0xff58,
IDX_TRUECRYPT_KEYFILES = 0xff4f,
IDX_USERNAME = 0xff50,
IDX_VERACRYPT_KEYFILES = 0xff51,
@ -1387,6 +1401,8 @@ typedef struct hc_device_param
u32 kernel_threads_min;
u32 kernel_threads_max;
bool overtune_unfriendly; // whatever sets this decide we operate in a mode that is not allowing to overtune threads_max or accel_max in autotuner
u64 kernel_power;
u64 hardware_power;
@ -1519,6 +1535,7 @@ typedef struct hc_device_param
bool has_lop3;
bool has_mov64;
bool has_prmt;
bool has_shfw;
double spin_damp;
@ -2209,6 +2226,8 @@ typedef struct outfile_ctx
char *filename;
hc_thread_mutex_t mux_outfile;
} outfile_ctx_t;
typedef struct pot
@ -2433,13 +2452,13 @@ typedef struct user_options
bool deprecated_check;
bool dynamic_x;
bool hwmon;
bool hash_info;
bool hex_charset;
bool hex_salt;
bool hex_wordlist;
bool increment;
bool keep_guessing;
bool keyspace;
bool total_candidates;
bool left;
bool logfile;
bool loopback;
@ -2484,7 +2503,6 @@ typedef struct user_options
char *bridge_parameter3;
char *bridge_parameter4;
char *cpu_affinity;
char *custom_charset_4;
char *debug_file;
char *induction_dir;
char *keyboard_layout_mapping;
@ -2503,6 +2521,11 @@ typedef struct user_options
const char *custom_charset_1;
const char *custom_charset_2;
const char *custom_charset_3;
const char *custom_charset_4;
const char *custom_charset_5;
const char *custom_charset_6;
const char *custom_charset_7;
const char *custom_charset_8;
const char *encoding_from;
const char *encoding_to;
const char *rule_buf_l;
@ -2526,6 +2549,7 @@ typedef struct user_options
#endif
u32 debug_mode;
u32 hwmon_temp_abort;
u32 hash_info;
int hash_mode;
u32 hccapx_message_pair;
u32 hook_threads;

View File

@ -358,7 +358,7 @@ LFLAGS_NATIVE += -lpthread
endif # NetBSD
ifeq ($(UNAME),Darwin)
export MACOSX_DEPLOYMENT_TARGET=10.15
export MACOSX_DEPLOYMENT_TARGET=15.0
CFLAGS_NATIVE := $(CFLAGS)
CFLAGS_NATIVE += -DWITH_HWMON

View File

@ -43,7 +43,8 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par
device_param->kernel_param.loop_cnt = kernel_loops; // not a bug, both need to be set
device_param->kernel_param.il_cnt = kernel_loops; // because there's two variables for inner iters for slow and fast hashes
const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads;
const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors)
* ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads);
u32 kernel_power_try = hardware_power * kernel_accel;
@ -98,6 +99,7 @@ static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *devi
static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
{
const hashes_t *hashes = hashcat_ctx->hashes;
const hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
const backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
const straight_ctx_t *straight_ctx = hashcat_ctx->straight_ctx;
@ -132,7 +134,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
device_param->kernel_accel = kernel_accel_min;
device_param->kernel_loops = kernel_loops_min;
device_param->kernel_threads = kernel_threads_min;
device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads_min;
device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors)
* ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads_min);
device_param->kernel_power = device_param->hardware_power * kernel_accel_min;
}
@ -211,7 +214,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
// from here it's clear we are allowed to autotune
// so let's init some fake words
const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads_max;
const u32 hardware_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors)
* ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : kernel_threads_max);
u32 kernel_power_max = hardware_power_max * kernel_accel_max;
@ -264,12 +268,12 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
if (device_param->is_cuda == true)
{
if (hc_cuMemcpyDtoDAsync (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->cuda_stream) == -1) return -1;
if (hc_cuMemcpyDtoD (hashcat_ctx, device_param->cuda_d_rules_c, device_param->cuda_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t)) == -1) return -1;
}
if (device_param->is_hip == true)
{
if (hc_hipMemcpyDtoDAsync (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t), device_param->hip_stream) == -1) return -1;
if (hc_hipMemcpyDtoD (hashcat_ctx, device_param->hip_d_rules_c, device_param->hip_d_rules, MIN (kernel_loops_max, KERNEL_RULES) * sizeof (kernel_rule_t)) == -1) return -1;
}
#if defined (__APPLE__)
@ -297,13 +301,13 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
{
const u32 kernel_threads_sav = device_param->kernel_threads;
device_param->kernel_threads = device_param->kernel_wgs1;
device_param->kernel_threads = MIN (device_param->kernel_wgs1, kernel_threads_max);
run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_max, false, 0, true);
if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE)
{
device_param->kernel_threads = device_param->kernel_wgs2p;
device_param->kernel_threads = MIN (device_param->kernel_wgs2p, kernel_threads_max);
run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_max, false, 0, true);
}
@ -328,18 +332,122 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
}
// v7 autotuner is a lot more straight forward
// we start with some purely theoretical values as a base, then move on to some meassured tests
for (u32 kernel_loops_test = kernel_loops_min; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1)
if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL)
{
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_test, kernel_threads_min, 2);
if (kernel_accel_min < kernel_accel_max)
{
// let's also do some minimal accel, this is only to improve early meassurements taken with try_run()
//printf ("loop %f %u %u %u\n", exec_msec, kernel_accel_min, kernel_loops_test, kernel_threads_min);
const u32 kernel_accel_start = previous_power_of_two (kernel_accel_max / 8);
if ((kernel_accel_start >= kernel_accel_min) && (kernel_accel_start <= kernel_accel_max))
{
kernel_accel = kernel_accel_start;
}
}
}
if (kernel_threads_min < kernel_threads_max)
{
// there could be a situation, like in 18600, where we have a thread_min which is not a multiple of
// kernel_preferred_wgs_multiple. As long as it's only a threads_min, but not a threads_max, we
// should stick to at least kernel_preferred_wgs_multiple
if (kernel_threads_min % device_param->kernel_preferred_wgs_multiple)
{
if ((device_param->kernel_preferred_wgs_multiple >= kernel_threads_min) && (device_param->kernel_preferred_wgs_multiple <= kernel_threads_max))
{
kernel_threads = device_param->kernel_preferred_wgs_multiple;
}
}
}
if (hashconfig->attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
{
if (hashes && hashes->salts_buf)
{
u32 start = kernel_loops_max;
const u32 salt_iter = hashes->salts_buf->salt_iter; // we use the first salt as reference
if (salt_iter)
{
start = MIN (start, smallest_repeat_double (hashes->salts_buf->salt_iter));
start = MIN (start, smallest_repeat_double (hashes->salts_buf->salt_iter + 1));
if (((hashes->salts_buf->salt_iter + 0) % 125) == 0) start = MIN (start, 125);
if (((hashes->salts_buf->salt_iter + 1) % 125) == 0) start = MIN (start, 125);
if ((start >= kernel_loops_min) && (start <= kernel_loops_max))
{
kernel_loops = start;
}
}
else
{
// how can there be a slow hash with no iterations?
}
}
}
else
{
// let's also do some minimal loops, this is only to improve early meassurements taken with try_run()
const u32 kernel_loops_start = previous_power_of_two (kernel_loops_max / 4);
if ((kernel_loops_start >= kernel_loops_min) && (kernel_loops_start <= kernel_loops_max))
{
kernel_loops = kernel_loops_start;
}
}
if (1)
{
// some algorithm start ways to high with these theoretical preset (for instance, 8700)
// so much that they can't be tuned anymore
while ((kernel_accel > kernel_accel_min) || (kernel_threads > kernel_threads_min) || (kernel_loops > kernel_loops_min))
{
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
if (exec_msec < target_msec / 16) break;
if (kernel_accel > kernel_accel_min)
{
kernel_accel = MAX (kernel_accel / 2, kernel_accel_min);
continue;
}
if (kernel_threads > kernel_threads_min)
{
kernel_threads = MAX (kernel_threads / 2, kernel_threads_min);
continue;
}
if (kernel_loops > kernel_loops_min)
{
kernel_loops = MAX (kernel_loops / 2, kernel_loops_min);
continue;
}
}
}
for (u32 kernel_loops_test = kernel_loops; kernel_loops_test <= kernel_loops_max; kernel_loops_test <<= 1)
{
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops_test, kernel_threads, 2);
//printf ("loop %f %u %u %u\n", exec_msec, kernel_accel, kernel_loops_test, kernel_threads);
if (exec_msec > target_msec) break;
// we want a little room for threads to play with so not full target_msec
// but of course only if we are going to make use of that :)
if ((kernel_accel_min < kernel_accel_max) || (kernel_threads_min < kernel_threads_max))
if ((kernel_accel < kernel_accel_max) || (kernel_threads < kernel_threads_max))
{
if (exec_msec > target_msec / 8) break;
@ -353,21 +461,46 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
kernel_loops = kernel_loops_test;
}
for (u32 kernel_threads_test = kernel_threads_min; kernel_threads_test <= kernel_threads_max; kernel_threads_test <<= 1)
{
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops, kernel_threads_test, 2);
double exec_msec_init = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 2);
//printf ("threads %f %u %u %u\n", exec_msec, kernel_accel_min, kernel_loops, kernel_threads_test);
float threads_eff_best = exec_msec_init / kernel_threads;
u32 threads_cnt_best = kernel_threads;
float threads_eff_prev = 0;
u32 threads_cnt_prev = 0;
for (u32 kernel_threads_test = kernel_threads; kernel_threads_test <= kernel_threads_max; kernel_threads_test = (kernel_threads_test < device_param->kernel_preferred_wgs_multiple) ? kernel_threads_test << 1 : kernel_threads_test + device_param->kernel_preferred_wgs_multiple)
{
double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads_test, 2);
//printf ("thread %f %u %u %u\n", exec_msec, kernel_accel, kernel_loops, kernel_threads_test);
if (exec_msec > target_msec) break;
if (kernel_threads >= 32)
{
// we want a little room for accel to play with so not full target_msec
if (exec_msec > target_msec / 8) break;
if (exec_msec > target_msec / 4) break;
}
kernel_threads = kernel_threads_test;
threads_eff_prev = exec_msec / kernel_threads_test;
threads_cnt_prev = kernel_threads_test;
//printf ("%f\n", threads_eff_prev);
if (threads_eff_prev < threads_eff_best)
{
threads_eff_best = threads_eff_prev;
threads_cnt_best = threads_cnt_prev;
}
}
// now we decide to choose either maximum or in some extreme cases prefer more efficient ones
if ((threads_eff_best * 1.06) < threads_eff_prev)
{
kernel_threads = threads_cnt_best;
}
#define STEPS_CNT 12
@ -401,20 +534,21 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
if (kernel_accel > kernel_accel_max) kernel_accel = kernel_accel_max;
}
if (kernel_accel > 64) kernel_accel -= kernel_accel % 32;
// overtune section. relevant if we have strange numbers from the APIs, namely 96, 384, and such
// this is a dangerous action, and we set conditions somewhere in the code to disable this
if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
{
if (kernel_accel > device_param->device_processors) kernel_accel -= kernel_accel % device_param->device_processors;
}
// some final play, if we have strange numbers from the APIs, namely 96, 384, and such
if ((kernel_accel_min == kernel_accel_max) || (kernel_threads_min == kernel_threads_max))
if ((kernel_accel_min == kernel_accel_max) || (kernel_threads_min == kernel_threads_max) || (device_param->overtune_unfriendly == true))
{
}
else
{
if (kernel_accel > 64) kernel_accel -= kernel_accel % 32;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
{
if (kernel_accel > device_param->device_processors) kernel_accel -= kernel_accel % device_param->device_processors;
}
u32 fun[2];
if (is_power_of_2 (kernel_threads) == false)
@ -539,7 +673,8 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param
device_param->kernel_loops = kernel_loops;
device_param->kernel_threads = kernel_threads;
const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads;
const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors)
* ((hashconfig->opts_type & OPTS_TYPE_THREAD_MULTI_DISABLE) ? 1 : device_param->kernel_threads);
device_param->hardware_power = hardware_power;
@ -578,7 +713,7 @@ HC_API_CALL void *thread_autotune (void *p)
if (device_param->is_hip == true)
{
if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
}
// check for autotune failure
@ -594,10 +729,6 @@ HC_API_CALL void *thread_autotune (void *p)
if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL;
}
if (device_param->is_hip == true)
{
if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
}
return NULL;
}

File diff suppressed because it is too large Load Diff

View File

@ -81,8 +81,8 @@ int bitmap_ctx_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
if (user_options->hash_info > 0) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;
@ -110,6 +110,8 @@ int bitmap_ctx_init (hashcat_ctx_t *hashcat_ctx)
u32 *bitmap_s2_c = (u32 *) hcmalloc ((1U << bitmap_max) * sizeof (u32));
u32 *bitmap_s2_d = (u32 *) hcmalloc ((1U << bitmap_max) * sizeof (u32));
if (!bitmap_s1_a || !bitmap_s1_b || !bitmap_s1_c || !bitmap_s1_d || !bitmap_s2_a || !bitmap_s2_b || !bitmap_s2_c || !bitmap_s2_d) return -1;
u32 bitmap_bits;
u32 bitmap_nums;
u32 bitmap_mask;

View File

@ -345,6 +345,34 @@ u32 brain_compute_attack (hashcat_ctx_t *hashcat_ctx)
XXH64_update (state, custom_charset_4, strlen (custom_charset_4));
}
if (user_options->custom_charset_5)
{
const char *custom_charset_5 = user_options->custom_charset_5;
XXH64_update (state, custom_charset_5, strlen (custom_charset_5));
}
if (user_options->custom_charset_6)
{
const char *custom_charset_6 = user_options->custom_charset_6;
XXH64_update (state, custom_charset_6, strlen (custom_charset_6));
}
if (user_options->custom_charset_7)
{
const char *custom_charset_7 = user_options->custom_charset_7;
XXH64_update (state, custom_charset_7, strlen (custom_charset_7));
}
if (user_options->custom_charset_8)
{
const char *custom_charset_8 = user_options->custom_charset_8;
XXH64_update (state, custom_charset_8, strlen (custom_charset_8));
}
}
else if (user_options->attack_mode == ATTACK_MODE_HYBRID1)
{
@ -405,6 +433,34 @@ u32 brain_compute_attack (hashcat_ctx_t *hashcat_ctx)
XXH64_update (state, custom_charset_4, strlen (custom_charset_4));
}
if (user_options->custom_charset_5)
{
const char *custom_charset_5 = user_options->custom_charset_5;
XXH64_update (state, custom_charset_5, strlen (custom_charset_5));
}
if (user_options->custom_charset_6)
{
const char *custom_charset_6 = user_options->custom_charset_6;
XXH64_update (state, custom_charset_6, strlen (custom_charset_6));
}
if (user_options->custom_charset_7)
{
const char *custom_charset_7 = user_options->custom_charset_7;
XXH64_update (state, custom_charset_7, strlen (custom_charset_7));
}
if (user_options->custom_charset_8)
{
const char *custom_charset_8 = user_options->custom_charset_8;
XXH64_update (state, custom_charset_8, strlen (custom_charset_8));
}
const int hex_wordlist = user_options->hex_wordlist;
XXH64_update (state, &hex_wordlist, sizeof (hex_wordlist));

View File

@ -87,12 +87,12 @@ bool bridges_init (hashcat_ctx_t *hashcat_ctx)
user_options_t *user_options = hashcat_ctx->user_options;
hashconfig_t *hashconfig = hashcat_ctx->hashconfig;
if (user_options->hash_info == true) return true;
if (user_options->backend_info > 0) return true;
if (user_options->hash_info > 0) return true;
if (user_options->usage > 0) return true;
if (user_options->left == true) return true;
if (user_options->show == true) return true;
if (user_options->usage > 0) return true;
if (user_options->version == true) return true;
if (user_options->backend_info > 0) return true;
// There is a problem here. At this point, hashconfig is not yet initialized.
// This is because initializing hashconfig requires the module to be loaded,
@ -241,12 +241,12 @@ bool bridges_salt_prepare (hashcat_ctx_t *hashcat_ctx)
hashes_t *hashes = hashcat_ctx->hashes;
user_options_t *user_options = hashcat_ctx->user_options;
if (user_options->hash_info == true) return true;
if (user_options->backend_info > 0) return true;
if (user_options->hash_info > 0) return true;
if (user_options->usage > 0) return true;
if (user_options->left == true) return true;
if (user_options->show == true) return true;
if (user_options->usage > 0) return true;
if (user_options->version == true) return true;
if (user_options->backend_info > 0) return true;
if (bridge_ctx->enabled == false) return true;

View File

@ -21,8 +21,8 @@ int combinator_ctx_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
if (user_options->hash_info > 0) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;
if (user_options->version == true) return 0;

View File

@ -17,8 +17,8 @@ int cpt_ctx_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
if (user_options->hash_info > 0) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;

View File

@ -118,9 +118,9 @@ int debugfile_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
if (user_options->hash_info > 0) return 0;
if (user_options->benchmark == true) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;

View File

@ -58,9 +58,9 @@ int dictstat_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
if (user_options->hash_info > 0) return 0;
if (user_options->benchmark == true) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;

View File

@ -381,7 +381,7 @@ HC_API_CALL void *thread_calc_stdin (void *p)
if (device_param->is_hip == true)
{
if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
}
if (calc_stdin (hashcat_ctx, device_param) == -1)
@ -396,11 +396,6 @@ HC_API_CALL void *thread_calc_stdin (void *p)
if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL;
}
if (device_param->is_hip == true)
{
if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
}
if (bridge_ctx->enabled == true)
{
if (bridge_ctx->thread_term != BRIDGE_DEFAULT)
@ -1685,7 +1680,7 @@ HC_API_CALL void *thread_calc (void *p)
if (device_param->is_hip == true)
{
if (hc_hipCtxPushCurrent (hashcat_ctx, device_param->hip_context) == -1) return NULL;
if (hc_hipSetDevice (hashcat_ctx, device_param->hip_device) == -1) return NULL;
}
if (calc (hashcat_ctx, device_param) == -1)
@ -1700,11 +1695,6 @@ HC_API_CALL void *thread_calc (void *p)
if (hc_cuCtxPopCurrent (hashcat_ctx, &device_param->cuda_context) == -1) return NULL;
}
if (device_param->is_hip == true)
{
if (hc_hipCtxPopCurrent (hashcat_ctx, &device_param->hip_context) == -1) return NULL;
}
if (bridge_ctx->enabled == true)
{
if (bridge_ctx->thread_term != BRIDGE_DEFAULT)

View File

@ -87,14 +87,19 @@ int cuda_init (void *hashcat_ctx)
HC_LOAD_FUNC_CUDA (cuda, cuLaunchKernel, cuLaunchKernel, CUDA_CULAUNCHKERNEL, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemAlloc, cuMemAlloc_v2, CUDA_CUMEMALLOC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemAllocHost, cuMemAllocHost_v2, CUDA_CUMEMALLOCHOST, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoD, cuMemcpyDtoD_v2, CUDA_CUMEMCPYDTOD, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoH, cuMemcpyDtoH_v2, CUDA_CUMEMCPYDTOH, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoD, cuMemcpyHtoD_v2, CUDA_CUMEMCPYHTOD, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32, cuMemsetD32, CUDA_CUMEMSETD32, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8, cuMemsetD8, CUDA_CUMEMSETD8, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoDAsync, cuMemcpyDtoDAsync_v2, CUDA_CUMEMCPYDTODASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyDtoHAsync, cuMemcpyDtoHAsync_v2, CUDA_CUMEMCPYDTOHASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemcpyHtoDAsync, cuMemcpyHtoDAsync_v2, CUDA_CUMEMCPYHTODASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async, cuMemsetD32Async, CUDA_CUMEMSETD32ASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async, cuMemsetD8Async, CUDA_CUMEMSETD8ASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemFree, cuMemFree_v2, CUDA_CUMEMFREE, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemFreeHost, cuMemFreeHost, CUDA_CUMEMFREEHOST, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemGetInfo, cuMemGetInfo_v2, CUDA_CUMEMGETINFO, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemsetD32Async, cuMemsetD32Async, CUDA_CUMEMSETD32ASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuMemsetD8Async, cuMemsetD8Async, CUDA_CUMEMSETD8ASYNC, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuModuleGetFunction, cuModuleGetFunction, CUDA_CUMODULEGETFUNCTION, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuModuleGetGlobal, cuModuleGetGlobal_v2, CUDA_CUMODULEGETGLOBAL, CUDA, 1);
HC_LOAD_FUNC_CUDA (cuda, cuModuleLoad, cuModuleLoad, CUDA_CUMODULELOAD, CUDA, 1);
@ -517,6 +522,142 @@ int hc_cuMemFree (void *hashcat_ctx, CUdeviceptr dptr)
return 0;
}
int hc_cuMemcpyDtoH (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemcpyDtoH (dstHost, srcDevice, ByteCount);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemcpyDtoH(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuMemcpyDtoD (void *hashcat_ctx, CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemcpyDtoD (dstDevice, srcDevice, ByteCount);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemcpyDtoD(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuMemcpyHtoD (void *hashcat_ctx, CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemcpyHtoD (dstDevice, srcHost, ByteCount);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemcpyHtoD(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuMemsetD32 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned int ui, size_t N)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemsetD32 (dstDevice, ui, N);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemsetD32(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemsetD32(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuMemsetD8 (void *hashcat_ctx, CUdeviceptr dstDevice, unsigned char uc, size_t N)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
CUDA_PTR *cuda = (CUDA_PTR *) backend_ctx->cuda;
const CUresult CU_err = cuda->cuMemsetD8 (dstDevice, uc, N);
if (CU_err != CUDA_SUCCESS)
{
const char *pStr = NULL;
if (cuda->cuGetErrorString (CU_err, &pStr) == CUDA_SUCCESS)
{
event_log_error (hashcat_ctx, "cuMemsetD8(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "cuMemsetD8(): %d", CU_err);
}
return -1;
}
return 0;
}
int hc_cuMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;

View File

@ -115,47 +115,55 @@ int hip_init (void *hashcat_ctx)
// a good reference is cuda.h itself
// this needs to be verified for each new cuda release
HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreateWithFlags, HIP_HIPEVENTCREATE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipGetErrorName, HIP_HIPGETERRORNAME, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync, hipMemcpyDtoDAsync, HIP_HIPMEMCPYDTODASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync, hipMemcpyDtoHAsync, HIP_HIPMEMCPYDTOHASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async, hipMemsetD32Async, HIP_HIPMEMSETD32ASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async, hipMemsetD8Async, HIP_HIPMEMSETD8ASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion, hipRuntimeGetVersion, HIP_HIPRUNTIMEGETVERSION, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties, hipGetDevicePropertiesR0600, HIP_HIPGETDEVICEPROPERTIES, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxCreate, hipCtxCreate, HIP_HIPCTXCREATE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxDestroy, hipCtxDestroy, HIP_HIPCTXDESTROY, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxPopCurrent, hipCtxPopCurrent, HIP_HIPCTXPOPCURRENT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxPushCurrent, hipCtxPushCurrent, HIP_HIPCTXPUSHCURRENT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxSetCurrent, hipCtxSetCurrent, HIP_HIPCTXSETCURRENT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipCtxSynchronize, hipCtxSynchronize, HIP_HIPCTXSYNCHRONIZE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGet, hipDeviceGet, HIP_HIPDEVICEGET, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGetAttribute, hipDeviceGetAttribute, HIP_HIPDEVICEGETATTRIBUTE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGetCount, hipGetDeviceCount, HIP_HIPDEVICEGETCOUNT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceGetName, hipDeviceGetName, HIP_HIPDEVICEGETNAME, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDeviceTotalMem, hipDeviceTotalMem, HIP_HIPDEVICETOTALMEM, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipDriverGetVersion, hipDriverGetVersion, HIP_HIPDRIVERGETVERSION, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventCreate, hipEventCreate, HIP_HIPEVENTCREATE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventCreateWithFlags, hipEventCreateWithFlags, HIP_HIPEVENTCREATEWITHFLAGS, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventDestroy, hipEventDestroy, HIP_HIPEVENTDESTROY, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventElapsedTime, hipEventElapsedTime, HIP_HIPEVENTELAPSEDTIME, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventRecord, hipEventRecord, HIP_HIPEVENTRECORD, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipEventSynchronize, hipEventSynchronize, HIP_HIPEVENTSYNCHRONIZE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipFuncGetAttribute, hipFuncGetAttribute, HIP_HIPFUNCGETATTRIBUTE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipGetErrorName, hipDrvGetErrorName, HIP_HIPGETERRORNAME, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipGetErrorString, hipDrvGetErrorString, HIP_HIPGETERRORSTRING, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipInit, hipInit, HIP_HIPINIT, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipLaunchKernel, hipModuleLaunchKernel, HIP_HIPLAUNCHKERNEL, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemAlloc, hipMalloc, HIP_HIPMEMALLOC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemFree, hipFree, HIP_HIPMEMFREE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemGetInfo, hipMemGetInfo, HIP_HIPMEMGETINFO, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoD, hipMemcpyDtoD, HIP_HIPMEMCPYDTOD, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoH, hipMemcpyDtoH, HIP_HIPMEMCPYDTOH, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoD, hipMemcpyHtoD, HIP_HIPMEMCPYHTOD, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemsetD32, hipMemsetD32, HIP_HIPMEMSETD32, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemsetD8, hipMemsetD8, HIP_HIPMEMSETD8, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoDAsync, hipMemcpyDtoDAsync, HIP_HIPMEMCPYDTODASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyDtoHAsync, hipMemcpyDtoHAsync, HIP_HIPMEMCPYDTOHASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemcpyHtoDAsync, hipMemcpyHtoDAsync, HIP_HIPMEMCPYHTODASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemsetD32Async, hipMemsetD32Async, HIP_HIPMEMSETD32ASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipMemsetD8Async, hipMemsetD8Async, HIP_HIPMEMSETD8ASYNC, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleGetFunction, hipModuleGetFunction, HIP_HIPMODULEGETFUNCTION, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleGetGlobal, hipModuleGetGlobal, HIP_HIPMODULEGETGLOBAL, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleLoadDataEx, hipModuleLoadDataEx, HIP_HIPMODULELOADDATAEX, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleUnload, hipModuleUnload, HIP_HIPMODULEUNLOAD, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipRuntimeGetVersion, hipRuntimeGetVersion, HIP_HIPRUNTIMEGETVERSION, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipSetDevice, hipSetDevice, HIP_HIPSETDEVICE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipSetDeviceFlags, hipSetDeviceFlags, HIP_HIPSETDEVICEFLAGS, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipStreamCreate, hipStreamCreate, HIP_HIPSTREAMCREATE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipStreamCreateWithFlags, hipStreamCreateWithFlags, HIP_HIPSTREAMCREATEWITHFLAGS, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipStreamDestroy, hipStreamDestroy, HIP_HIPSTREAMDESTROY, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipStreamSynchronize, hipStreamSynchronize, HIP_HIPSTREAMSYNCHRONIZE, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipGetDeviceProperties, hipGetDevicePropertiesR0600, HIP_HIPGETDEVICEPROPERTIES, HIP, 1);
HC_LOAD_FUNC_HIP (hip, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, HIP_HIPMODULEOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR, HIP, 1);
return 0;
}
@ -503,13 +511,13 @@ int hc_hipDriverGetVersion (void *hashcat_ctx, int *driverVersion)
return 0;
}
int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flags)
int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipEventCreate (phEvent, Flags);
const hipError_t HIP_err = hip->hipEventCreate (phEvent);
if (HIP_err != hipSuccess)
{
@ -530,6 +538,33 @@ int hc_hipEventCreate (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int Flag
return 0;
}
int hc_hipEventCreateWithFlags (void *hashcat_ctx, hipEvent_t *phEvent, unsigned int flags)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipEventCreateWithFlags (phEvent, flags);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipEventCreateWithFlags(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipEventDestroy (void *hashcat_ctx, hipEvent_t hEvent)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
@ -800,6 +835,143 @@ int hc_hipMemGetInfo (void *hashcat_ctx, size_t *free, size_t *total)
return 0;
}
int hc_hipMemcpyDtoH (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipMemcpyDtoH (dstHost, srcDevice, ByteCount);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipMemcpyDtoH(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipMemcpyDtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipMemcpyDtoD (dstDevice, srcDevice, ByteCount);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipMemcpyDtoD(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipMemcpyHtoD (void *hashcat_ctx, hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipMemcpyHtoD (dstDevice, srcHost, ByteCount);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipMemcpyHtoD(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipMemcpyHtoD(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipMemsetD32 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned int ui, size_t N)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipMemsetD32 (dstDevice, ui, N);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipMemsetD32(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipMemsetD32(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipMemsetD8 (void *hashcat_ctx, hipDeviceptr_t dstDevice, unsigned char uc, size_t N)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipMemsetD8 (dstDevice, uc, N);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipMemsetD8(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipMemsetD8(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipMemcpyDtoHAsync (void *hashcat_ctx, void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
@ -1070,13 +1242,67 @@ int hc_hipRuntimeGetVersion (void *hashcat_ctx, int *runtimeVersion)
return 0;
}
int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags)
int hc_hipSetDevice (void *hashcat_ctx, hipDevice_t dev)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipStreamCreate (phStream, Flags);
const hipError_t HIP_err = hip->hipSetDevice (dev);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipSetDevice(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipSetDevice(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipSetDeviceFlags (void *hashcat_ctx, unsigned int flags)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipSetDeviceFlags (flags);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipSetDeviceFlags(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipStreamCreate (phStream);
if (HIP_err != hipSuccess)
{
@ -1097,6 +1323,33 @@ int hc_hipStreamCreate (void *hashcat_ctx, hipStream_t *phStream, unsigned int F
return 0;
}
int hc_hipStreamCreateWithFlags (void *hashcat_ctx, hipStream_t *phStream, unsigned int Flags)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;
HIP_PTR *hip = (HIP_PTR *) backend_ctx->hip;
const hipError_t HIP_err = hip->hipStreamCreateWithFlags (phStream, Flags);
if (HIP_err != hipSuccess)
{
const char *pStr = NULL;
if (hip->hipGetErrorString (HIP_err, &pStr) == hipSuccess)
{
event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %s", pStr);
}
else
{
event_log_error (hashcat_ctx, "hipStreamCreateWithFlags(): %d", HIP_err);
}
return -1;
}
return 0;
}
int hc_hipStreamDestroy (void *hashcat_ctx, hipStream_t hStream)
{
backend_ctx_t *backend_ctx = ((hashcat_ctx_t *) hashcat_ctx)->backend_ctx;

View File

@ -195,11 +195,14 @@ static int hc_mtlBuildOptionsToDict (void *hashcat_ctx, const char *build_option
}
// if set, add INCLUDE_PATH to hack Apple kernel build from source limitation on -I usage
if (include_path != nil)
{
NSString *path_key = @"INCLUDE_PATH";
NSString *path_value = [NSString stringWithCString: include_path encoding: NSUTF8StringEncoding];
// Include path may contain spaces, escape them with a backslash
path_value = [path_value stringByReplacingOccurrencesOfString:@" " withString:@"\\ "];
[build_options_dict setObject:path_value forKey:path_key];
@ -743,6 +746,7 @@ int hc_mtlCreateKernel (void *hashcat_ctx, mtl_device_id metal_device, mtl_libra
dispatch_queue_t queue = dispatch_get_global_queue (DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
// if no user-defined runtime, set to METAL_COMPILER_RUNTIME
long timeout = (user_options->metal_compiler_runtime > 0) ? user_options->metal_compiler_runtime : METAL_COMPILER_RUNTIME;
dispatch_time_t when = dispatch_time (DISPATCH_TIME_NOW,NSEC_PER_SEC * timeout);
@ -1314,10 +1318,21 @@ int hc_mtlSetCommandEncoderArg (void *hashcat_ctx, mtl_command_encoder metal_com
return 0;
}
int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, size_t global_work_size, size_t local_work_size, double *ms)
int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_command_encoder, mtl_command_buffer metal_command_buffer, const unsigned int work_dim, const size_t global_work_size[3], const size_t local_work_size[3], double *ms)
{
MTLSize numThreadgroups = {local_work_size, 1, 1};
MTLSize threadsGroup = {global_work_size, 1, 1};
MTLSize threadsPerThreadgroup =
{
local_work_size[0],
local_work_size[1],
local_work_size[2]
};
MTLSize threadgroupsPerGrid =
{
(global_work_size[0] + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width,
work_dim > 1 ? (global_work_size[1] + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height : 1,
work_dim > 2 ? (global_work_size[2] + threadsPerThreadgroup.depth - 1) / threadsPerThreadgroup.depth : 1
};
if (metal_command_encoder == nil)
{
@ -1333,7 +1348,7 @@ int hc_mtlEncodeComputeCommand (void *hashcat_ctx, mtl_command_encoder metal_com
return -1;
}
[metal_command_encoder dispatchThreadgroups: threadsGroup threadsPerThreadgroup: numThreadgroups];
[metal_command_encoder dispatchThreadgroups: threadgroupsPerGrid threadsPerThreadgroup: threadsPerThreadgroup];
[metal_command_encoder endEncoding];
[metal_command_buffer commit];
@ -1377,17 +1392,22 @@ int hc_mtlCreateLibraryWithFile (void *hashcat_ctx, mtl_device_id metal_device,
if (k_string != nil)
{
id <MTLLibrary> r = [metal_device newLibraryWithFile: k_string error: &error];
NSURL *libURL = [NSURL fileURLWithPath: k_string];
if (error != nil)
if (libURL != nil)
{
event_log_error (hashcat_ctx, "%s(): failed to create metal library from metallib, %s", __func__, [[error localizedDescription] UTF8String]);
return -1;
id <MTLLibrary> r = [metal_device newLibraryWithURL: libURL error:&error];
if (error != nil)
{
event_log_error (hashcat_ctx, "%s(): failed to create metal library from metallib, %s", __func__, [[error localizedDescription] UTF8String]);
return -1;
}
*metal_library = r;
return 0;
}
*metal_library = r;
return 0;
}
return -1;
@ -1420,10 +1440,17 @@ int hc_mtlCreateLibraryWithSource (void *hashcat_ctx, mtl_device_id metal_device
}
compileOptions.preprocessorMacros = build_options_dict;
/*
compileOptions.optimizationLevel = MTLLibraryOptimizationLevelSize;
compileOptions.mathMode = MTLMathModeSafe;
// compileOptions.mathMode = MTLMathModeRelaxed;
// compileOptions.enableLogging = true;
// compileOptions.fastMathEnabled = false;
*/
}
// todo: detect current os version and choose the right
// compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3;
// compileOptions.languageVersion = MTL_LANGUAGEVERSION_2_3;
/*
if (@available(macOS 12.0, *))
{

View File

@ -132,6 +132,7 @@ static int inner2_loop (hashcat_ctx_t *hashcat_ctx)
status_ctx->words_base = status_ctx->words_cnt / amplifier_cnt;
EVENT (EVENT_CALCULATED_WORDS_BASE);
EVENT (EVENT_CALCULATED_WORDS_CNT);
if (user_options->keyspace == true)
{
@ -1481,6 +1482,8 @@ bool autodetect_hashmode_test (hashcat_ctx_t *hashcat_ctx)
{
char *input_buf = user_options_extra->hc_hash;
if (!input_buf) return false;
size_t input_len = strlen (input_buf);
char *hash_buf = NULL;

View File

@ -334,7 +334,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla
if (device_param->is_cuda == true)
{
rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->cuda_stream);
rc = hc_cuMemcpyDtoH (hashcat_ctx, tmps, device_param->cuda_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size);
if (rc == 0)
{
@ -351,7 +351,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla
if (device_param->is_hip == true)
{
rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size, device_param->hip_stream);
rc = hc_hipMemcpyDtoH (hashcat_ctx, tmps, device_param->hip_d_tmps + (plain->gidvid * hashconfig->tmp_size), hashconfig->tmp_size);
if (rc == 0)
{
@ -382,7 +382,7 @@ int check_hash (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, pla
if (device_param->is_opencl == true)
{
rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_FALSE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, &opencl_event);
rc = hc_clEnqueueReadBuffer (hashcat_ctx, device_param->opencl_command_queue, device_param->opencl_d_tmps, CL_TRUE, plain->gidvid * hashconfig->tmp_size, hashconfig->tmp_size, tmps, 0, NULL, &opencl_event);
if (rc == 0)
{
@ -587,14 +587,14 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
if (device_param->is_cuda == true)
{
if (hc_cuMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32), device_param->cuda_stream) == -1) return -1;
if (hc_cuMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->cuda_d_result, sizeof (u32)) == -1) return -1;
if (hc_cuStreamSynchronize (hashcat_ctx, device_param->cuda_stream) == -1) return -1;
}
if (device_param->is_hip == true)
{
if (hc_hipMemcpyDtoHAsync (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32), device_param->hip_stream) == -1) return -1;
if (hc_hipMemcpyDtoH (hashcat_ctx, &num_cracked, device_param->hip_d_result, sizeof (u32)) == -1) return -1;
if (hc_hipStreamSynchronize (hashcat_ctx, device_param->hip_stream) == -1) return -1;
}
@ -624,7 +624,7 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
if (device_param->is_cuda == true)
{
rc = hc_cuMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->cuda_stream);
rc = hc_cuMemcpyDtoH (hashcat_ctx, cracked, device_param->cuda_d_plain_bufs, num_cracked * sizeof (plain_t));
if (rc == 0)
{
@ -641,7 +641,7 @@ int check_cracked (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param)
if (device_param->is_hip == true)
{
rc = hc_hipMemcpyDtoHAsync (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t), device_param->hip_stream);
rc = hc_hipMemcpyDtoH (hashcat_ctx, cracked, device_param->hip_d_plain_bufs, num_cracked * sizeof (plain_t));
if (rc == 0)
{
@ -1133,7 +1133,7 @@ int hashes_init_stage1 (hashcat_ctx_t *hashcat_ctx)
hashes_cnt = 1;
}
else if (user_options->hash_info == true)
else if (user_options->hash_info > 0)
{
}
else if (user_options->keyspace == true)
@ -2386,13 +2386,13 @@ int hashes_init_stage5 (hashcat_ctx_t *hashcat_ctx)
char *st_hash = strdup (tmp_buf);
event_log_error (hashcat_ctx, "ERROR: Incompatible self-test SCRYPT configuration detected.");
event_log_error (hashcat_ctx, "ERROR: Incompatible self-test configuration detected.");
event_log_warning (hashcat_ctx, "The specified target hash:");
event_log_warning (hashcat_ctx, " -> %s", user_hash);
event_log_warning (hashcat_ctx, "does not match the SCRYPT configuration of the self-test hash:");
event_log_warning (hashcat_ctx, "does not match the configuration of the self-test hash:");
event_log_warning (hashcat_ctx, " -> %s", st_hash);
event_log_warning (hashcat_ctx, "The JIT-compiled kernel for this SCRYPT configuration may be incompatible.");
event_log_warning (hashcat_ctx, "The JIT-compiled kernel for this configuration may be incompatible.");
event_log_warning (hashcat_ctx, "You must disable the self-test functionality or recompile the plugin with a matching self-test hash.");
event_log_warning (hashcat_ctx, "To disable the self-test, use the --self-test-disable option.");
event_log_warning (hashcat_ctx, NULL);
@ -2414,11 +2414,11 @@ int hashes_init_stage5 (hashcat_ctx_t *hashcat_ctx)
char *user_hash2 = strdup (tmp_buf);
event_log_error (hashcat_ctx, "ERROR: Mixed SCRYPT configuration detected.");
event_log_error (hashcat_ctx, "ERROR: Mixed configuration detected.");
event_log_warning (hashcat_ctx, "The specified target hash:");
event_log_warning (hashcat_ctx, " -> %s", user_hash);
event_log_warning (hashcat_ctx, "does not match the SCRYPT configuration of another target hash:");
event_log_warning (hashcat_ctx, "does not match the configuration of another target hash:");
event_log_warning (hashcat_ctx, " -> %s", user_hash2);
event_log_warning (hashcat_ctx, "Please run these hashes in separate cracking sessions.");
event_log_warning (hashcat_ctx, NULL);

View File

@ -1268,142 +1268,10 @@ u64 hm_get_memoryused_with_devices_idx (hashcat_ctx_t *hashcat_ctx, const int ba
return 0;
}
int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
static void hwmon_ctx_init_nvml (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_nvml, int backend_devices_cnt)
{
bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx;
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
user_options_t *user_options = hashcat_ctx->user_options;
hwmon_ctx->enabled = false;
int backend_devices_cnt = backend_ctx->backend_devices_cnt;
if (bridge_ctx->enabled == true) backend_devices_cnt = 1;
//#if !defined (WITH_HWMON)
//return 0;
//#endif // WITH_HWMON
if (user_options->usage > 0) return 0;
//if (user_options->backend_info > 0) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;
if (user_options->stdout_flag == true) return 0;
if (user_options->version == true) return 0;
if (user_options->identify == true) return 0;
//we need hwmon support to get free memory per device support
//its a joke, but there's no way around
//if (user_options->hwmon == false) return 0;
hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
/**
* Initialize shared libraries
*/
hm_attrs_t *hm_adapters_adl = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_nvapi = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_nvml = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_sysfs_amdgpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_sysfs_cpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_iokit = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
#define FREE_ADAPTERS \
do { \
hcfree (hm_adapters_adl); \
hcfree (hm_adapters_nvapi); \
hcfree (hm_adapters_nvml); \
hcfree (hm_adapters_sysfs_amdgpu); \
hcfree (hm_adapters_sysfs_cpu); \
hcfree (hm_adapters_iokit); \
} while (0)
if (backend_ctx->need_nvml == true)
{
hwmon_ctx->hm_nvml = (NVML_PTR *) hcmalloc (sizeof (NVML_PTR));
if (nvml_init (hashcat_ctx) == -1)
{
hcfree (hwmon_ctx->hm_nvml);
hwmon_ctx->hm_nvml = NULL;
}
}
if ((backend_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too
{
hwmon_ctx->hm_nvapi = (NVAPI_PTR *) hcmalloc (sizeof (NVAPI_PTR));
if (nvapi_init (hashcat_ctx) == -1)
{
hcfree (hwmon_ctx->hm_nvapi);
hwmon_ctx->hm_nvapi = NULL;
}
}
if (backend_ctx->need_adl == true)
{
hwmon_ctx->hm_adl = (ADL_PTR *) hcmalloc (sizeof (ADL_PTR));
if (adl_init (hashcat_ctx) == -1)
{
hcfree (hwmon_ctx->hm_adl);
hwmon_ctx->hm_adl = NULL;
}
}
if (backend_ctx->need_sysfs_amdgpu == true)
{
hwmon_ctx->hm_sysfs_amdgpu = (SYSFS_AMDGPU_PTR *) hcmalloc (sizeof (SYSFS_AMDGPU_PTR));
if (sysfs_amdgpu_init (hashcat_ctx) == false)
{
hcfree (hwmon_ctx->hm_sysfs_amdgpu);
hwmon_ctx->hm_sysfs_amdgpu = NULL;
}
// also if there's ADL, we don't need sysfs_amdgpu
if (hwmon_ctx->hm_adl)
{
hcfree (hwmon_ctx->hm_sysfs_amdgpu);
hwmon_ctx->hm_sysfs_amdgpu = NULL;
}
}
if (backend_ctx->need_sysfs_cpu == true)
{
hwmon_ctx->hm_sysfs_cpu = (SYSFS_CPU_PTR *) hcmalloc (sizeof (SYSFS_CPU_PTR));
if (sysfs_cpu_init (hashcat_ctx) == false)
{
hcfree (hwmon_ctx->hm_sysfs_cpu);
hwmon_ctx->hm_sysfs_cpu = NULL;
}
}
#if defined(__APPLE__)
if (backend_ctx->need_iokit == true)
{
hwmon_ctx->hm_iokit = (IOKIT_PTR *) hcmalloc (sizeof (IOKIT_PTR));
if (iokit_init (hashcat_ctx) == false)
{
hcfree (hwmon_ctx->hm_iokit);
hwmon_ctx->hm_iokit = NULL;
}
}
#endif
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
if (hwmon_ctx->hm_nvml)
{
@ -1485,6 +1353,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hcfree (nvmlGPUHandle);
}
}
}
static void hwmon_ctx_init_nvapi (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_nvapi, int backend_devices_cnt)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
if (hwmon_ctx->hm_nvapi)
{
@ -1558,6 +1432,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hcfree (nvGPUHandle);
}
}
}
static int hwmon_ctx_init_adl (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_adl, int backend_devices_cnt)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
if (hwmon_ctx->hm_adl)
{
@ -1567,23 +1447,13 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
int tmp_in;
if (get_adapters_num_adl (hashcat_ctx, &tmp_in) == -1)
{
FREE_ADAPTERS;
return -1;
}
if (get_adapters_num_adl (hashcat_ctx, &tmp_in) == -1) return -1;
// adapter info
LPAdapterInfo lpAdapterInfo = (LPAdapterInfo) hccalloc (tmp_in, sizeof (AdapterInfo));
if (hm_ADL_Adapter_AdapterInfo_Get (hashcat_ctx, lpAdapterInfo, tmp_in * sizeof (AdapterInfo)) == -1)
{
FREE_ADAPTERS;
return -1;
}
if (hm_ADL_Adapter_AdapterInfo_Get (hashcat_ctx, lpAdapterInfo, tmp_in * sizeof (AdapterInfo)) == -1) return -1;
for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
{
@ -1639,107 +1509,260 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
}
}
return 0;
}
static void hwmon_ctx_init_sysfs_amdgpu_iokit (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_sysfs_amdgpu, hm_attrs_t *hm_adapters_iokit, int backend_devices_cnt)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
if (hwmon_ctx->hm_sysfs_amdgpu || hwmon_ctx->hm_iokit)
{
if (true)
for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
{
for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
if (device_param->skipped == true) continue;
if (device_param->is_cuda == true)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
// nothing to do
}
if (device_param->skipped == true) continue;
#if defined (__APPLE__)
if (device_param->is_metal == true)
{
const u32 device_id = device_param->device_id;
if (device_param->is_cuda == true)
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
{
// nothing to do
hm_adapters_iokit[device_id].buslanes_get_supported = false;
hm_adapters_iokit[device_id].corespeed_get_supported = false;
hm_adapters_iokit[device_id].fanspeed_get_supported = true;
hm_adapters_iokit[device_id].fanpolicy_get_supported = false;
hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
hm_adapters_iokit[device_id].temperature_get_supported = true;
hm_adapters_iokit[device_id].utilization_get_supported = true;
}
}
#endif
if ((device_param->is_opencl == true) || (device_param->is_hip == true))
{
const u32 device_id = device_param->device_id;
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
{
hm_adapters_iokit[device_id].buslanes_get_supported = false;
hm_adapters_iokit[device_id].corespeed_get_supported = false;
hm_adapters_iokit[device_id].fanspeed_get_supported = true;
hm_adapters_iokit[device_id].fanpolicy_get_supported = false;
hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
hm_adapters_iokit[device_id].temperature_get_supported = true;
hm_adapters_iokit[device_id].utilization_get_supported = true;
}
#if defined (__APPLE__)
if (device_param->is_metal == true)
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
if (hwmon_ctx->hm_sysfs_amdgpu)
{
const u32 device_id = device_param->device_id;
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
{
hm_adapters_iokit[device_id].buslanes_get_supported = false;
hm_adapters_iokit[device_id].corespeed_get_supported = false;
hm_adapters_iokit[device_id].fanspeed_get_supported = true;
hm_adapters_iokit[device_id].fanpolicy_get_supported = false;
hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
hm_adapters_iokit[device_id].temperature_get_supported = true;
hm_adapters_iokit[device_id].utilization_get_supported = true;
}
}
#endif
if ((device_param->is_opencl == true) || (device_param->is_hip == true))
{
const u32 device_id = device_param->device_id;
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (hwmon_ctx->hm_iokit))
{
hm_adapters_iokit[device_id].buslanes_get_supported = false;
hm_adapters_iokit[device_id].corespeed_get_supported = false;
hm_adapters_iokit[device_id].fanspeed_get_supported = true;
hm_adapters_iokit[device_id].fanpolicy_get_supported = false;
hm_adapters_iokit[device_id].memoryspeed_get_supported = false;
hm_adapters_iokit[device_id].temperature_get_supported = true;
hm_adapters_iokit[device_id].utilization_get_supported = true;
}
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) == 0) continue;
if (hwmon_ctx->hm_sysfs_amdgpu)
{
hm_adapters_sysfs_amdgpu[device_id].buslanes_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].corespeed_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].fanspeed_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].fanpolicy_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported = true;
}
hm_adapters_sysfs_amdgpu[device_id].buslanes_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].corespeed_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].fanspeed_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].fanpolicy_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].memoryspeed_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].temperature_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].utilization_get_supported = true;
hm_adapters_sysfs_amdgpu[device_id].memoryused_get_supported = true;
}
}
}
}
}
static void hwmon_ctx_init_sysfs_cpu (hashcat_ctx_t *hashcat_ctx, hm_attrs_t *hm_adapters_sysfs_cpu, int backend_devices_cnt)
{
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
if (hwmon_ctx->hm_sysfs_cpu)
{
if (true)
for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
{
for (int backend_devices_idx = 0; backend_devices_idx < backend_devices_cnt; backend_devices_idx++)
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
if (device_param->skipped == true) continue;
if (device_param->is_cuda == true)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_idx];
if (device_param->skipped == true) continue;
if (device_param->is_cuda == true)
{
// nothing to do
}
}
if ((device_param->is_opencl == true) || (device_param->is_hip == true))
if ((device_param->is_opencl == true) || (device_param->is_hip == true))
{
const u32 device_id = device_param->device_id;
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) == 0) continue;
if (hwmon_ctx->hm_sysfs_cpu)
{
const u32 device_id = device_param->device_id;
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) == 0) continue;
if (hwmon_ctx->hm_sysfs_cpu)
{
hm_adapters_sysfs_cpu[device_id].buslanes_get_supported = false;
hm_adapters_sysfs_cpu[device_id].corespeed_get_supported = false;
hm_adapters_sysfs_cpu[device_id].fanspeed_get_supported = false;
hm_adapters_sysfs_cpu[device_id].fanpolicy_get_supported = false;
hm_adapters_sysfs_cpu[device_id].memoryspeed_get_supported = false;
hm_adapters_sysfs_cpu[device_id].temperature_get_supported = true;
hm_adapters_sysfs_cpu[device_id].utilization_get_supported = true;
}
hm_adapters_sysfs_cpu[device_id].buslanes_get_supported = false;
hm_adapters_sysfs_cpu[device_id].corespeed_get_supported = false;
hm_adapters_sysfs_cpu[device_id].fanspeed_get_supported = false;
hm_adapters_sysfs_cpu[device_id].fanpolicy_get_supported = false;
hm_adapters_sysfs_cpu[device_id].memoryspeed_get_supported = false;
hm_adapters_sysfs_cpu[device_id].temperature_get_supported = true;
hm_adapters_sysfs_cpu[device_id].utilization_get_supported = true;
}
}
}
}
}
int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
{
bridge_ctx_t *bridge_ctx = hashcat_ctx->bridge_ctx;
hwmon_ctx_t *hwmon_ctx = hashcat_ctx->hwmon_ctx;
backend_ctx_t *backend_ctx = hashcat_ctx->backend_ctx;
user_options_t *user_options = hashcat_ctx->user_options;
hwmon_ctx->enabled = false;
int backend_devices_cnt = backend_ctx->backend_devices_cnt;
if (bridge_ctx->enabled == true) backend_devices_cnt = 1;
//#if !defined (WITH_HWMON)
//return 0;
//#endif // WITH_HWMON
if (user_options->usage > 0) return 0;
if (user_options->hash_info > 0) return 0;
//if (user_options->backend_info > 0) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;
if (user_options->stdout_flag == true) return 0;
if (user_options->version == true) return 0;
if (user_options->identify == true) return 0;
//we need hwmon support to get free memory per device support
//its a joke, but there's no way around
//if (user_options->hwmon == false) return 0;
hwmon_ctx->hm_device = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
/**
* Initialize shared libraries
*/
hm_attrs_t *hm_adapters_adl = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_nvapi = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_nvml = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_sysfs_amdgpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_sysfs_cpu = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
hm_attrs_t *hm_adapters_iokit = (hm_attrs_t *) hccalloc (DEVICES_MAX, sizeof (hm_attrs_t));
if (backend_ctx->need_nvml == true)
{
hwmon_ctx->hm_nvml = (NVML_PTR *) hcmalloc (sizeof (NVML_PTR));
if (nvml_init (hashcat_ctx) == -1)
{
hcfree (hwmon_ctx->hm_nvml);
hwmon_ctx->hm_nvml = NULL;
}
}
if ((backend_ctx->need_nvapi == true) && (hwmon_ctx->hm_nvml)) // nvapi can't work alone, we need nvml, too
{
hwmon_ctx->hm_nvapi = (NVAPI_PTR *) hcmalloc (sizeof (NVAPI_PTR));
if (nvapi_init (hashcat_ctx) == -1)
{
hcfree (hwmon_ctx->hm_nvapi);
hwmon_ctx->hm_nvapi = NULL;
}
}
if (backend_ctx->need_adl == true)
{
hwmon_ctx->hm_adl = (ADL_PTR *) hcmalloc (sizeof (ADL_PTR));
if (adl_init (hashcat_ctx) == -1)
{
hcfree (hwmon_ctx->hm_adl);
hwmon_ctx->hm_adl = NULL;
}
}
if (backend_ctx->need_sysfs_amdgpu == true)
{
hwmon_ctx->hm_sysfs_amdgpu = (SYSFS_AMDGPU_PTR *) hcmalloc (sizeof (SYSFS_AMDGPU_PTR));
if (sysfs_amdgpu_init (hashcat_ctx) == false)
{
hcfree (hwmon_ctx->hm_sysfs_amdgpu);
hwmon_ctx->hm_sysfs_amdgpu = NULL;
}
}
if (backend_ctx->need_sysfs_cpu == true)
{
hwmon_ctx->hm_sysfs_cpu = (SYSFS_CPU_PTR *) hcmalloc (sizeof (SYSFS_CPU_PTR));
if (sysfs_cpu_init (hashcat_ctx) == false)
{
hcfree (hwmon_ctx->hm_sysfs_cpu);
hwmon_ctx->hm_sysfs_cpu = NULL;
}
}
#if defined(__APPLE__)
if (backend_ctx->need_iokit == true)
{
hwmon_ctx->hm_iokit = (IOKIT_PTR *) hcmalloc (sizeof (IOKIT_PTR));
if (iokit_init (hashcat_ctx) == false)
{
hcfree (hwmon_ctx->hm_iokit);
hwmon_ctx->hm_iokit = NULL;
}
}
#endif
hwmon_ctx_init_nvml (hashcat_ctx, hm_adapters_nvml, backend_devices_cnt);
hwmon_ctx_init_nvapi (hashcat_ctx, hm_adapters_nvapi, backend_devices_cnt);
// if ADL init fail, disable
if (hwmon_ctx_init_adl (hashcat_ctx, hm_adapters_adl, backend_devices_cnt) == -1)
{
hcfree (hwmon_ctx->hm_adl);
hwmon_ctx->hm_adl = NULL;
}
// if there's ADL, we don't need sysfs_amdgpu
if (hwmon_ctx->hm_adl)
{
hcfree (hwmon_ctx->hm_sysfs_amdgpu);
hwmon_ctx->hm_sysfs_amdgpu = NULL;
}
hwmon_ctx_init_sysfs_amdgpu_iokit (hashcat_ctx, hm_adapters_sysfs_amdgpu, hm_adapters_iokit, backend_devices_cnt);
hwmon_ctx_init_sysfs_cpu (hashcat_ctx, hm_adapters_sysfs_cpu, backend_devices_cnt);
#if defined(__APPLE__)
if (backend_ctx->need_iokit == true)
@ -1757,7 +1780,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
if (hwmon_ctx->hm_adl == NULL && hwmon_ctx->hm_nvml == NULL && hwmon_ctx->hm_sysfs_amdgpu == NULL && hwmon_ctx->hm_sysfs_cpu == NULL && hwmon_ctx->hm_iokit == NULL)
{
FREE_ADAPTERS;
hcfree (hm_adapters_adl);
hcfree (hm_adapters_nvapi);
hcfree (hm_adapters_nvml);
hcfree (hm_adapters_sysfs_amdgpu);
hcfree (hm_adapters_sysfs_cpu);
hcfree (hm_adapters_iokit);
return 0;
}
@ -1992,7 +2020,12 @@ int hwmon_ctx_init (hashcat_ctx_t *hashcat_ctx)
hm_get_memoryused_with_devices_idx (hashcat_ctx, backend_devices_idx);
}
FREE_ADAPTERS;
hcfree (hm_adapters_adl);
hcfree (hm_adapters_nvapi);
hcfree (hm_adapters_nvml);
hcfree (hm_adapters_sysfs_amdgpu);
hcfree (hm_adapters_sysfs_cpu);
hcfree (hm_adapters_iokit);
return 0;
}

View File

@ -41,9 +41,9 @@ int induct_ctx_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
if (user_options->hash_info > 0) return 0;
if (user_options->benchmark == true) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;

View File

@ -363,7 +363,7 @@ int hashconfig_init (hashcat_ctx_t *hashcat_ctx)
hashconfig->has_optimized_kernel = hc_path_read (source_file);
if (user_options->hash_info == false)
if (user_options->hash_info == 0 || user_options->hash_info > 1)
{
if (user_options->optimized_kernel == true)
{

View File

@ -62,9 +62,9 @@ int loopback_init (hashcat_ctx_t *hashcat_ctx)
if (user_options->usage > 0) return 0;
if (user_options->backend_info > 0) return 0;
if (user_options->hash_info > 0) return 0;
if (user_options->benchmark == true) return 0;
if (user_options->hash_info == true) return 0;
if (user_options->keyspace == true) return 0;
if (user_options->left == true) return 0;
if (user_options->show == true) return 0;

View File

@ -192,12 +192,13 @@ static void main_outerloop_starting (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MA
status_ctx->shutdown_outer = false;
if (user_options->hash_info == true) return;
if (user_options->backend_info > 0) return;
if (user_options->hash_info > 0) return;
if (user_options->keyspace == true) return;
if (user_options->stdout_flag == true) return;
if (user_options->speed_only == true) return;
if (user_options->identify == true) return;
if (user_options->backend_info > 0) return;
if ((user_options_extra->wordlist_mode == WL_MODE_FILE) || (user_options_extra->wordlist_mode == WL_MODE_MASK))
{
@ -269,10 +270,11 @@ static void main_cracker_finished (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYB
const user_options_t *user_options = hashcat_ctx->user_options;
const user_options_extra_t *user_options_extra = hashcat_ctx->user_options_extra;
if (user_options->hash_info == true) return;
if (user_options->backend_info > 0) return;
if (user_options->hash_info > 0) return;
if (user_options->keyspace == true) return;
if (user_options->stdout_flag == true) return;
if (user_options->backend_info > 0) return;
// if we had a prompt, clear it
@ -370,10 +372,22 @@ static void main_calculated_words_base (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx,
const user_options_t *user_options = hashcat_ctx->user_options;
if (user_options->keyspace == false) return;
if (user_options->total_candidates == true) return;
event_log_info (hashcat_ctx, "%" PRIu64 "", status_ctx->words_base);
}
static void main_calculated_words_cnt (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
{
const status_ctx_t *status_ctx = hashcat_ctx->status_ctx;
const user_options_t *user_options = hashcat_ctx->user_options;
if (user_options->keyspace == false) return;
if (user_options->total_candidates == false) return;
event_log_info (hashcat_ctx, "%" PRIu64 "", status_ctx->words_cnt);
}
static void main_potfile_remove_parse_pre (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE_UNUSED const void *buf, MAYBE_UNUSED const size_t len)
{
const user_options_t *user_options = hashcat_ctx->user_options;
@ -631,7 +645,17 @@ static void main_backend_session_hostmem (MAYBE_UNUSED hashcat_ctx_t *hashcat_ct
const u64 *hostmem = (const u64 *) buf;
event_log_info (hashcat_ctx, "Host memory required for this attack: %" PRIu64 " MB", *hostmem / (1024 * 1024));
u64 free_memory = 0;
if (get_free_memory (&free_memory) == false)
{
event_log_info (hashcat_ctx, "Host memory allocated for this attack: %" PRIu64 " MB", *hostmem / (1024 * 1024));
}
else
{
event_log_info (hashcat_ctx, "Host memory allocated for this attack: %" PRIu64 " MB (%" PRIu64 " MB free)", *hostmem / (1024 * 1024), free_memory / (1024 * 1024));
}
event_log_info (hashcat_ctx, NULL);
}
@ -996,7 +1020,7 @@ static void main_hashconfig_post (MAYBE_UNUSED hashcat_ctx_t *hashcat_ctx, MAYBE
if (hashconfig->is_salted == true)
{
if (hashconfig->opti_type & OPTI_TYPE_RAW_HASH)
if (hashconfig->opti_type & OPTI_TYPE_RAW_HASH || hashconfig->salt_type & SALT_TYPE_GENERIC)
{
event_log_info (hashcat_ctx, "Minimum salt length supported by kernel: %u", hashconfig->salt_min);
event_log_info (hashcat_ctx, "Maximum salt length supported by kernel: %u", hashconfig->salt_max);
@ -1251,6 +1275,7 @@ static void event (const u32 id, hashcat_ctx_t *hashcat_ctx, const void *buf, co
case EVENT_BRIDGES_SALT_POST: main_bridges_salt_post (hashcat_ctx, buf, len); break;
case EVENT_BRIDGES_SALT_PRE: main_bridges_salt_pre (hashcat_ctx, buf, len); break;
case EVENT_CALCULATED_WORDS_BASE: main_calculated_words_base (hashcat_ctx, buf, len); break;
case EVENT_CALCULATED_WORDS_CNT: main_calculated_words_cnt (hashcat_ctx, buf, len); break;
case EVENT_CRACKER_FINISHED: main_cracker_finished (hashcat_ctx, buf, len); break;
case EVENT_CRACKER_HASH_CRACKED: main_cracker_hash_cracked (hashcat_ctx, buf, len); break;
case EVENT_CRACKER_STARTING: main_cracker_starting (hashcat_ctx, buf, len); break;
@ -1408,7 +1433,7 @@ int main (int argc, char **argv)
rc_final = 0;
}
else if (user_options->hash_info == true)
else if (user_options->hash_info > 0)
{
hash_info (hashcat_ctx);

150
src/modules/argon2_common.c Normal file
View File

@ -0,0 +1,150 @@
/**
* Author......: Netherlands Forensic Institute
* License.....: MIT
*/
#include <inttypes.h>
#include "common.h"
#include "types.h"
#include "modules.h"
#include "bitops.h"
#include "convert.h"
#include "shared.h"
#include "memory.h"
#define ARGON2_SYNC_POINTS 4
#define ARGON2_BLOCK_SIZE 1024
u64 argon2_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u64 tmp_size = 0; // we'll add some later
return tmp_size;
}
u64 get_largest_memory_block_count (const hashes_t *hashes)
{
argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf;
argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
u64 largest_memory_block_count = (options_st == NULL) ? options->memory_block_count : options_st->memory_block_count;
for (u32 i = 0; i < hashes->salts_cnt; i++)
{
largest_memory_block_count = MAX (largest_memory_block_count, options->memory_block_count);
options++;
}
return largest_memory_block_count;
}
const char *argon2_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
const u32 memory_block_count = get_largest_memory_block_count (hashes);
const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count;
int lines_sz = 4096;
char *lines_buf = hcmalloc (lines_sz);
int lines_pos = 0;
const u32 device_processors = device_param->device_processors;
const u32 device_maxworkgroup_size = device_param->device_maxworkgroup_size;
const u64 fixed_mem = (256 * 1024 * 1024); // some storage we need for pws[], tmps[], and others. Is around 72MiB in reality.
const u64 spill_mem = 2048 * device_processors * device_maxworkgroup_size; // 1600 according to ptxas
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - (fixed_mem + spill_mem);
u32 kernel_accel_new = device_processors;
if (kernel_accel_user)
{
kernel_accel_new = kernel_accel_user;
}
else
{
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false))
{
kernel_accel_new = available_mem / size_per_accel;
kernel_accel_new = MIN (kernel_accel_new, 1024); // 1024 = max supported
}
}
char *new_device_name = hcstrdup (device_param->device_name);
for (size_t i = 0; i < strlen (new_device_name); i++)
{
if (new_device_name[i] == ' ') new_device_name[i] = '_';
}
lines_pos += snprintf (lines_buf + lines_pos, lines_sz - lines_pos, "%s * %u 1 %u A\n", new_device_name, user_options->hash_mode, kernel_accel_new);
hcfree (new_device_name);
return lines_buf;
}
u64 argon2_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
{
const u32 memory_block_count = get_largest_memory_block_count (hashes);
const u64 size_per_accel = ARGON2_BLOCK_SIZE * memory_block_count;
const u64 size_argon2 = device_param->kernel_accel_max * size_per_accel;
return size_argon2;
}
u64 argon2_module_extra_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes)
{
/*
argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf;
argon2_options_t *options_st = (argon2_options_t *) hashes->st_esalts_buf;
const u32 memory_block_count = (options->memory_block_count) ? options->memory_block_count : options_st->memory_block_count;
const u32 parallelism = (options->parallelism) ? options->parallelism : options_st->parallelism;
for (u32 i = 1; i < hashes->salts_cnt; i++)
{
if ((memory_block_count != options[i].memory_block_count)
|| (parallelism != options[i].parallelism))
{
return (1ULL << 63) + i;
}
}
// now that we know they all have the same settings, we also need to check the self-test hash is different to what the user hash is using
if ((hashconfig->opts_type & OPTS_TYPE_SELF_TEST_DISABLE) == 0)
{
if ((memory_block_count != options_st->memory_block_count)
|| (parallelism != options_st->parallelism))
{
return (1ULL << 62);
}
}
*/
u64 tmp_size = sizeof (argon2_tmp_t);
return tmp_size;
}
char *argon2_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
{
//argon2_options_t *options = (argon2_options_t *) hashes->esalts_buf;
char *jit_build_options = NULL;
//hc_asprintf (&jit_build_options, "-D ARGON2_PARALLELISM=%u -D ARGON2_TMP_ELEM=%u", options[0].parallelism, options[0].memory_block_count);
return jit_build_options;
}

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH_SALTED;
static const char *HASH_NAME = "md5($salt.$pass)";
static const u64 KERN_TYPE = 20;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_FORUM_SOFTWARE;
static const char *HASH_NAME = "osCommerce, xt:Commerce";
static const u64 KERN_TYPE = 20;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_OS;
static const char *HASH_NAME = "Juniper NetScreen/SSG (ScreenOS)";
static const u64 KERN_TYPE = 20;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_IMS;
static const char *HASH_NAME = "Skype";
static const u64 KERN_TYPE = 20;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_EAS;
static const char *HASH_NAME = "SolarWinds Serv-U";
static const u64 KERN_TYPE = 20;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH;
static const char *HASH_NAME = "SHA2-224";
static const u64 KERN_TYPE = 1300;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_EARLY_SKIP
| OPTI_TYPE_NOT_ITERATED

View File

@ -20,7 +20,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4; // originally DGST_SIZE_4_2
static const u32 HASH_CATEGORY = HASH_CATEGORY_OS;
static const char *HASH_NAME = "descrypt, DES (Unix), Traditional DES";
static const u64 KERN_TYPE = 1500;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE
| OPTS_TYPE_TM_KERNEL

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_OS;
static const char *HASH_NAME = "sha512crypt $6$, SHA512 (Unix)";
static const u64 KERN_TYPE = 1800;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_USES_BITS_64;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE;

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_OS;
static const char *HASH_NAME = "AIX {ssha256}";
static const u64 KERN_TYPE = 6400;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE;

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER;
static const char *HASH_NAME = "LastPass + LastPass sniffed";
static const u64 KERN_TYPE = 6800;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE;

View File

@ -85,15 +85,6 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
return pw_max;
}
char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
{
char *jit_build_options = NULL;
hc_asprintf (&jit_build_options, "-D NO_UNROLL");
return jit_build_options;
}
int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
{
u64 *digest = (u64 *) digest_buf;
@ -398,7 +389,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_hook23 = MODULE_DEFAULT;
module_ctx->module_hook_salt_size = MODULE_DEFAULT;
module_ctx->module_hook_size = MODULE_DEFAULT;
module_ctx->module_jit_build_options = module_jit_build_options;
module_ctx->module_jit_build_options = MODULE_DEFAULT;
module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;

View File

@ -21,6 +21,7 @@ static const char *HASH_NAME = "SAP CODVN B (BCODE)";
static const u64 KERN_TYPE = 7700;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_NOT_ITERATED;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE

View File

@ -21,6 +21,7 @@ static const char *HASH_NAME = "SAP CODVN B (BCODE) from RFC_READ_TABLE";
static const u64 KERN_TYPE = 7701;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_PRECOMPUTE_INIT
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_NOT_ITERATED;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_LE

View File

@ -20,6 +20,7 @@ static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER;
static const char *HASH_NAME = "1Password, cloudkeychain";
static const u64 KERN_TYPE = 8200;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT
| OPTI_TYPE_USES_BITS_64
| OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE

View File

@ -19,7 +19,8 @@ static const u32 DGST_SIZE = DGST_SIZE_4_5;
static const u32 HASH_CATEGORY = HASH_CATEGORY_NETWORK_SERVER;
static const char *HASH_NAME = "DNSSEC (NSEC3)";
static const u64 KERN_TYPE = 8300;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_REGISTER_LIMIT;
static const u64 OPTS_TYPE = OPTS_TYPE_STOCK_MODULE
| OPTS_TYPE_PT_GENERATE_BE
| OPTS_TYPE_ST_HEX

View File

@ -109,11 +109,11 @@ bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
{
if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU))
{
if (device_param->is_metal == false)
if (device_param->is_metal == true)
{
if (strncmp (device_param->device_name, "Apple M", 7) == 0)
if (strncmp (device_param->device_name, "Intel", 5) == 0)
{
// AppleM1, OpenCL, MTLCompilerService, createKernel never-end with pure kernel and newComputePipelineState failed with optimized kernel
// Intel Iris Graphics, Metal Version 244.303: failed to create 'm10700_loop' pipeline, timeout reached (status 49)
return true;
}
}
@ -152,38 +152,47 @@ u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED con
char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
{
const u32 shared_size_scratch = (32 + 64 + 16); // LOCAL_VK u32 s_sc[FIXED_LOCAL_SIZE][PWMAXSZ4 + BLMAXSZ4 + AESSZ4];
const u32 shared_size_aes = (5 * 1024); // LOCAL_VK u32 s_te0[256];
char *jit_build_options = NULL;
if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL)
if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
{
u32 native_threads = 0;
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", 1);
}
else
{
u32 overhead = 0;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU)
if (device_param->opencl_device_vendor_id == VENDOR_ID_NV)
{
native_threads = 1;
}
else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
#if defined (__APPLE__)
// note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
// Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
// on my development system. no clue where the 4 bytes are spent.
// I did some research on this and it seems to be related with the datatype.
// For example, if i used u8 instead, there's only 1 byte wasted.
native_threads = 32;
#else
if (device_param->device_local_mem_size < 49152)
if (device_param->is_opencl == true)
{
native_threads = MIN (device_param->kernel_preferred_wgs_multiple, 32); // We can't just set 32, because Intel GPU need 8
overhead = 1;
}
else
{
// to go over 48KiB, we need to use dynamic shared mem
native_threads = 49152 / 128;
}
#endif
}
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", native_threads);
const u32 device_local_mem_size = MIN (device_param->device_local_mem_size, 48*1024);
u32 fixed_local_size = ((device_local_mem_size - overhead) - shared_size_aes) / shared_size_scratch;
if (user_options->kernel_threads_chgd == true)
{
fixed_local_size = user_options->kernel_threads;
}
else
{
if (fixed_local_size > device_param->kernel_preferred_wgs_multiple) fixed_local_size -= fixed_local_size % device_param->kernel_preferred_wgs_multiple;
}
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u -D _unroll", fixed_local_size);
}
return jit_build_options;

View File

@ -129,6 +129,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
return tmp_size;
}
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 250;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@ -344,7 +351,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kern_type = module_kern_type;

View File

@ -146,6 +146,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
return tmp_size;
}
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 250;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@ -361,7 +368,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kern_type = module_kern_type;

View File

@ -146,6 +146,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
return tmp_size;
}
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 250;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@ -361,7 +368,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kern_type = module_kern_type;

View File

@ -131,6 +131,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
return tmp_size;
}
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 250;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@ -346,7 +353,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kern_type = module_kern_type;

View File

@ -148,6 +148,13 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
return tmp_size;
}
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_min = 250;
return kernel_loops_min;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_loops_max = 1000; // lowest PIM multiplier
@ -363,7 +370,7 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kern_type = module_kern_type;

Some files were not shown because too many files have changed in this diff Show More