From 725528058c257b4a7d1faea16abaaf60c447d5f2 Mon Sep 17 00:00:00 2001
From: Jens Steube
Date: Sun, 13 Jul 2025 08:59:52 +0200
Subject: [PATCH] Fix funnelshift usage on AMD and NV platforms

While HIP doesn't have funnelshift emulation, it's better to use the
native __builtin_amdgcn_alignbit() on AMD. For NVIDIA, we need to make
sure the target matches a supported SM version and fall back to
byte_perm() otherwise.

Fix hash-mode 6900 in optimized mode and attack mode 3

This hash-mode doesn't use any stopbit, and if the password length is
exactly 16 or 32, then hashcat selects the next higher kernel, e.g.,
s16 for password length 32. For such corner cases, we must copy s08
code to s16. It doesn't seem other algorithms are affected. Some have
the s16 body left empty, but they have a password length limit < 32.

Add test_edge* to .gitignore
---
 .gitignore                    |   1 +
 OpenCL/inc_common.cl          | 114 +++++++++++++++++++----------
 OpenCL/inc_platform.cl        |  64 ++++++++++-------
 OpenCL/inc_platform.h         |   5 ++
 OpenCL/m06900_a3-optimized.cl | 130 ++++++++++++++++++++++++++++++++++
 5 files changed, 253 insertions(+), 61 deletions(-)

diff --git a/.gitignore b/.gitignore
index b268eedfb..99a9615aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ obj/*.a
 include/CL
 tools/luks_tests
 .vscode
+test_edge*
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl
index d029a2bf3..e0a9f28ed 100644
--- a/OpenCL/inc_common.cl
+++ b/OpenCL/inc_common.cl
@@ -1781,6 +1781,7 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
   return r;
 }
 
+#ifdef USE_FUNNELSHIFT
 DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
 {
   const int c_mod_4 = c & 3;
@@ -1788,35 +1789,35 @@ DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
   u32x r;
 
   #if VECT_SIZE == 1
-  r = __funnelshift_r (b, a, c_mod_4 * 8);
+  r = hc_funnelshift_r (b, a, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 2
-  r.s0 = __funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
-  r.s1 = __funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
+  r.s0 = hc_funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
+  r.s1 = hc_funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 4
-  r.s2 = __funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
-  r.s3 = __funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
+  r.s2 = hc_funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
+  r.s3 = hc_funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 8
-  r.s4 = __funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
-  r.s5 = __funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
-  r.s6 = __funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
-  r.s7 = __funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
+  r.s4 = hc_funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
+  r.s5 = hc_funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
+  r.s6 = hc_funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
+  r.s7 = hc_funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 16
-  r.s8 = __funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
-  r.s9 = __funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
-  r.sa = __funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
-  r.sb = __funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
-  r.sc = __funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
-  r.sd = __funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
-  r.se = __funnelshift_r (b.se, a.se, c_mod_4 * 8);
-  r.sf = __funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
+  r.s8 = hc_funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
+  r.s9 = hc_funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
+  r.sa = hc_funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
+  r.sb = hc_funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
+  r.sc = hc_funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
+  r.sd = hc_funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
+  r.se = hc_funnelshift_r (b.se, a.se, c_mod_4 * 8);
+  r.sf = hc_funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
   #endif
 
   return r;
@@ -1826,7 +1827,7 @@ DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
 {
   const int c_mod_4 = c & 3;
 
-  const u32 r = __funnelshift_r (b, a, c_mod_4 * 8);
+  const u32 r = hc_funnelshift_r (b, a, c_mod_4 * 8);
 
   return r;
 }
@@ -1838,35 +1839,35 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
   u32x r;
 
   #if VECT_SIZE == 1
-  r = __funnelshift_l (a, b, c_mod_4 * 8);
+  r = hc_funnelshift_l (a, b, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 2
-  r.s0 = __funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
-  r.s1 = __funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
+  r.s0 = hc_funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
+  r.s1 = hc_funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 4
-  r.s2 = __funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
-  r.s3 = __funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
+  r.s2 = hc_funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
+  r.s3 = hc_funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 8
-  r.s4 = __funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
-  r.s5 = __funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
-  r.s6 = __funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
-  r.s7 = __funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
+  r.s4 = hc_funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
+  r.s5 = hc_funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
+  r.s6 = hc_funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
+  r.s7 = hc_funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
   #endif
 
   #if VECT_SIZE >= 16
-  r.s8 = __funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
-  r.s9 = __funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
-  r.sa = __funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
-  r.sb = __funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
-  r.sc = __funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
-  r.sd = __funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
-  r.se = __funnelshift_l (a.se, b.se, c_mod_4 * 8);
-  r.sf = __funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
+  r.s8 = hc_funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
+  r.s9 = hc_funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
+  r.sa = hc_funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
+  r.sb = hc_funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
+  r.sc = hc_funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
+  r.sd = hc_funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
+  r.se = hc_funnelshift_l (a.se, b.se, c_mod_4 * 8);
+  r.sf = hc_funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
   #endif
 
   return r;
@@ -1876,10 +1877,51 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
 {
   const int c_mod_4 = c & 3;
 
-  const u32 r = __funnelshift_l (a, b, c_mod_4 * 8);
+  const u32 r = hc_funnelshift_l (a, b, c_mod_4 * 8);
 
   return r;
 }
+#else
+DECLSPEC u32 hc_bytealign_be (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const u32 r = hc_byte_perm (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const u32 r = hc_byte_perm_S (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const int c_minus_4 = 4 - c_mod_4;
+
+  const u32x r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const int c_minus_4 = 4 - c_mod_4;
+
+  const u32 r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+
+  return r;
+}
+#endif
 
 DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
 {
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index 0a386f22e..284dbf02a 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -6,6 +6,7 @@
 #include "inc_vendor.h"
 #include "inc_types.h"
 #include "inc_platform.h"
+#include "inc_common.h"
 
 #ifdef IS_NATIVE
 #define FIXED_THREAD_COUNT(n)
@@ -60,6 +61,33 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
 
 #endif // IS_AMD
 
+// this applies to cuda and opencl
+#if defined IS_NV
+
+#ifdef USE_FUNNELSHIFT
+
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift)
+{
+  u32 result;
+
+  asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+
+  return result;
+}
+
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift)
+{
+  u32 result;
+
+  asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+
+  return result;
+}
+
+#endif
+
+#endif // IS_NV
+
 #if defined IS_CUDA
 
 #if ATTACK_EXEC == 11
@@ -243,7 +271,7 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
   #ifdef USE_FUNNELSHIFT
-  return __funnelshift_l (a, a, n);
+  return hc_funnelshift_l (a, a, n);
   #else
   return ((a << n) | ((a >> (32 - n))));
   #endif
@@ -252,7 +280,7 @@ DECLSPEC u32 rotl32_S (const u32 a, const int n)
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
   #ifdef USE_FUNNELSHIFT
-  return __funnelshift_r (a, a, n);
+  return hc_funnelshift_r (a, a, n);
   #else
   return ((a >> n) | ((a << (32 - n))));
   #endif
@@ -472,29 +500,17 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
-  return __funnelshift_l (a, a, n);
-  #else
-  return ((a << n) | ((a >> (32 - n))));
-  #endif
+  return rotr32_S (a, 32 - n);
 }
 
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
-  return __funnelshift_r (a, a, n);
-  #else
-  return ((a >> n) | ((a << (32 - n))));
-  #endif
+  return __builtin_amdgcn_alignbit (a, a, n);
 }
 
 DECLSPEC u64x rotl64 (const u64x a, const int n)
 {
-  #if VECT_SIZE == 1
-  return rotl64_S (a, n);
-  #else
-  return ((a << n) | ((a >> (64 - n))));
-  #endif
+  return rotr64 (a, 64 - n);
 }
 
 DECLSPEC u64x rotr64 (const u64x a, const int n)
@@ -513,7 +529,6 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n)
 
 DECLSPEC u64 rotr64_S (const u64 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
   vconv64_t in;
 
   in.v64 = a;
@@ -523,21 +538,20 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
 
   vconv64_t out;
 
+  const int n31 = n & 31;
+
   if (n < 32)
   {
-    out.v32.a = __funnelshift_r (a0, a1, n);
-    out.v32.b = __funnelshift_r (a1, a0, n);
+    out.v32.a = __builtin_amdgcn_alignbit (a1, a0, n31);
+    out.v32.b = __builtin_amdgcn_alignbit (a0, a1, n31);
   }
   else
   {
-    out.v32.a = __funnelshift_r (a1, a0, n - 32);
-    out.v32.b = __funnelshift_r (a0, a1, n - 32);
+    out.v32.a = __builtin_amdgcn_alignbit (a0, a1, n31);
+    out.v32.b = __builtin_amdgcn_alignbit (a1, a0, n31);
   }
 
   return out.v64;
-  #else
-  return ((a >> n) | ((a << (64 - n))));
-  #endif
 }
 
 #define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0)
diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h
index 9729d4fad..800988056 100644
--- a/OpenCL/inc_platform.h
+++ b/OpenCL/inc_platform.h
@@ -21,6 +21,11 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n);
 DECLSPEC u64 rotr64_S (const u64 a, const int n);
 #endif // IS_AMD
 
+#ifdef IS_NV
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift);
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift);
+#endif // IS_NV
+
 #ifdef IS_CUDA
 DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
diff --git a/OpenCL/m06900_a3-optimized.cl b/OpenCL/m06900_a3-optimized.cl
index d71302d55..064eb3e58 100644
--- a/OpenCL/m06900_a3-optimized.cl
+++ b/OpenCL/m06900_a3-optimized.cl
@@ -1190,6 +1190,71 @@ KERNEL_FQ KERNEL_FA void m06900_m08 (KERN_ATTR_BASIC ())
 
 KERNEL_FQ KERNEL_FA void m06900_m16 (KERN_ATTR_BASIC ())
 {
+  /**
+   * base
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * sbox
+   */
+
+  LOCAL_VK u32 s_tables[4][256];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    s_tables[0][i] = c_tables[0][i];
+    s_tables[1][i] = c_tables[1][i];
+    s_tables[2][i] = c_tables[2][i];
+    s_tables[3][i] = c_tables[3][i];
+  }
+
+  SYNC_THREADS ();
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * modifier
+   */
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4]; // no change here, because m06900m() doesn't support > 32, but we need a _m16 kernel because hashcat will call _m16 if pw_len >= 32
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  /**
+   * main
+   */
+
+  m06900m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }
 
 KERNEL_FQ KERNEL_FA void m06900_s04 (KERN_ATTR_BASIC ())
@@ -1332,4 +1397,69 @@ KERNEL_FQ KERNEL_FA void m06900_s08 (KERN_ATTR_BASIC ())
 
 KERNEL_FQ KERNEL_FA void m06900_s16 (KERN_ATTR_BASIC ())
 {
+  /**
+   * base
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * sbox
+   */
+
+  LOCAL_VK u32 s_tables[4][256];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    s_tables[0][i] = c_tables[0][i];
+    s_tables[1][i] = c_tables[1][i];
+    s_tables[2][i] = c_tables[2][i];
+    s_tables[3][i] = c_tables[3][i];
+  }
+
+  SYNC_THREADS ();
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * modifier
+   */
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4]; // no change here, because m06900s() doesn't support > 32, but we need a _s16 kernel because hashcat will call _s16 if pw_len >= 32
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  /**
+   * main
+   */
+
+  m06900s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }
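
Note on the byte_perm() fallback added to inc_common.cl: the selector
(0x76543210 >> (c_mod_4 * 4)) & 0xffff picks the four result bytes out of the
b:a (or a:b) byte pool at exactly the positions a funnel shift by c_mod_4 * 8
bits would place them. The host-side sketch below is only an illustration of
that equivalence, not part of the patch; model_byte_perm(), the
ref_funnelshift_* reference functions, and the test values are made-up
stand-ins for the GPU intrinsics.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the __byte_perm() selector semantics: nibble i of s picks result
   byte i from the eight-byte pool {x.b0..x.b3, y.b0..y.b3}. */
static uint32_t model_byte_perm (uint32_t x, uint32_t y, uint32_t s)
{
  const uint64_t pool = ((uint64_t) y << 32) | x;

  uint32_t r = 0;

  for (int i = 0; i < 4; i++)
  {
    const uint32_t sel = (s >> (i * 4)) & 7;

    r |= (uint32_t) ((pool >> (sel * 8)) & 0xff) << (i * 8);
  }

  return r;
}

/* Funnel-shift reference: 64-bit shift of hi:lo, matching what
   hc_funnelshift_r()/hc_funnelshift_l() compute for shifts below 32. */
static uint32_t ref_funnelshift_r (uint32_t lo, uint32_t hi, int n)
{
  return (uint32_t) ((((uint64_t) hi << 32) | lo) >> n);
}

static uint32_t ref_funnelshift_l (uint32_t lo, uint32_t hi, int n)
{
  return (uint32_t) (((((uint64_t) hi << 32) | lo) << n) >> 32);
}

int main (void)
{
  const uint32_t a = 0x11223344;
  const uint32_t b = 0xaabbccdd;

  for (int c = 0; c < 4; c++)
  {
    /* hc_bytealign_be_S fallback vs. hc_funnelshift_r (b, a, c * 8) */
    const uint32_t be = model_byte_perm (b, a, (0x76543210 >> (c * 4)) & 0xffff);

    assert (be == ref_funnelshift_r (b, a, c * 8));

    /* hc_bytealign_S fallback vs. hc_funnelshift_l (a, b, c * 8) */
    const uint32_t le = model_byte_perm (a, b, (0x76543210 >> ((4 - c) * 4)) & 0xffff);

    assert (le == ref_funnelshift_l (a, b, c * 8));
  }

  printf ("byte_perm fallback matches the funnelshift reference\n");

  return 0;
}

Compiled with a plain C compiler, the asserts pass for all four byte offsets,
which covers the full range c_mod_4 can take.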
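
Note on the AMD path in inc_platform.cl: rotr64_S() now builds the rotated
64-bit value from two __builtin_amdgcn_alignbit() calls, swapping which half
feeds the high word once the rotate amount reaches 32. The sketch below checks
that decomposition against a plain 64-bit rotate, assuming the builtin behaves
like v_alignbit_b32, i.e. returns the low 32 bits of (hi:lo) >> (n & 31);
model_alignbit() is a hypothetical stand-in for the builtin.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed model of __builtin_amdgcn_alignbit (v_alignbit_b32):
   low 32 bits of ((hi:lo) >> (n & 31)). */
static uint32_t model_alignbit (uint32_t hi, uint32_t lo, int n)
{
  return (uint32_t) ((((uint64_t) hi << 32) | lo) >> (n & 31));
}

/* Plain 64-bit rotate right as the reference. */
static uint64_t rotr64_ref (uint64_t a, int n)
{
  n &= 63;

  return (n == 0) ? a : (a >> n) | (a << (64 - n));
}

/* Same decomposition as the patched AMD rotr64_S(). */
static uint64_t rotr64_alignbit (uint64_t a, int n)
{
  const uint32_t a0 = (uint32_t) (a >>  0); // low word
  const uint32_t a1 = (uint32_t) (a >> 32); // high word

  const int n31 = n & 31;

  uint32_t lo;
  uint32_t hi;

  if (n < 32)
  {
    lo = model_alignbit (a1, a0, n31);
    hi = model_alignbit (a0, a1, n31);
  }
  else
  {
    lo = model_alignbit (a0, a1, n31);
    hi = model_alignbit (a1, a0, n31);
  }

  return ((uint64_t) hi << 32) | lo;
}

int main (void)
{
  const uint64_t a = 0x0123456789abcdefULL;

  for (int n = 0; n < 64; n++)
  {
    assert (rotr64_alignbit (a, n) == rotr64_ref (a, n));
  }

  printf ("alignbit-based rotr64 matches the 64-bit reference\n");

  return 0;
}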