
Fix funnelshift usage on AMD and NV platforms

While HIP doesn't have funnelshift emulation, it's better to use the
native __builtin_amdgcn_alignbit() on AMD. For NVIDIA, we need to make
sure the target matches a supported SM version and fall back to
byte_perm() otherwise.
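
As a concrete illustration of the resulting dispatch, a 32-bit right rotate can be written as a funnel shift of a word with itself on NVIDIA, as the native alignbit builtin on AMD, and as plain shifts elsewhere. This is only a sketch: it assumes the hc_funnelshift_r() wrapper added by this commit, and rotr32_sketch plus the exact #ifdef spelling are illustrative, not hashcat's actual macros.

/* Sketch only: rotate right by funnel-shifting a word with itself. */
DECLSPEC u32 rotr32_sketch (const u32 a, const int n)
{
#if defined IS_NV && defined USE_FUNNELSHIFT
  return hc_funnelshift_r (a, a, n);          // shf.r.wrap.b32 via inline PTX, SM target permitting
#elif defined IS_AMD
  return __builtin_amdgcn_alignbit (a, a, n); // native AMD bit-align instruction, no emulation needed
#else
  return ((a >> n) | ((a << (32 - n))));      // portable fallback
#endif
}

When USE_FUNNELSHIFT is not set (unsupported SM target), the byte-alignment helpers fall back to the hc_byte_perm()-based variants added in the #else branch further down, and the plain-shift rotate is used instead.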

Fix hash-mode 6900 in optimized mode and attack mode 3

This hash-mode doesn't use any stopbit, and if the password length is
exactly 16 or 32, then hashcat selects the next higher kernel, e.g., s16
for password length 32. For such corner cases, we must copy s08 code to
s16. It doesn't seem other algorithms are affected. Some have the s16
body left empty, but they have a password length limit < 32.
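
For illustration, a rough model of that kernel-size selection is sketched below; pick_kernel_suffix() and its thresholds are hypothetical, not hashcat's actual host code.

/* Hypothetical sketch: which optimized kernel variant a password length
 * selects. Without a stopbit, a 32-byte password still lands in the _16
 * variant, so that body must not be left empty. */
static const char *pick_kernel_suffix (const unsigned int pw_len)
{
  if (pw_len >= 32) return "s16"; // pw_len == 32 ends up here for hash-mode 6900
  if (pw_len >= 16) return "s08"; // 16..31 bytes
  return "s04";                   // up to 15 bytes
}

Since the optimized m06900 kernels cap the password at 32 bytes (see the w2/w3 comments in the kernel diff below), pw_len == 32 is the only case that reaches the _16 variant, which is why copying the s08 body into s16 is enough.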

Add test_edge* to .gitignore
Jens Steube 2025-07-13 08:59:52 +02:00
parent 4a6b538b43
commit 725528058c
5 changed files with 253 additions and 61 deletions

.gitignore

@@ -29,6 +29,7 @@ obj/*.a
 include/CL
 tools/luks_tests
 .vscode
+test_edge*
 # Byte-compiled / optimized / DLL files
 __pycache__/


@@ -1781,6 +1781,7 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
 return r;
 }
+#ifdef USE_FUNNELSHIFT
 DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
 {
 const int c_mod_4 = c & 3;
@@ -1788,35 +1789,35 @@ DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
 u32x r;
 #if VECT_SIZE == 1
-r = __funnelshift_r (b, a, c_mod_4 * 8);
+r = hc_funnelshift_r (b, a, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 2
-r.s0 = __funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
-r.s1 = __funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
+r.s0 = hc_funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
+r.s1 = hc_funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 4
-r.s2 = __funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
-r.s3 = __funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
+r.s2 = hc_funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
+r.s3 = hc_funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 8
-r.s4 = __funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
-r.s5 = __funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
-r.s6 = __funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
-r.s7 = __funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
+r.s4 = hc_funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
+r.s5 = hc_funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
+r.s6 = hc_funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
+r.s7 = hc_funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 16
-r.s8 = __funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
-r.s9 = __funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
-r.sa = __funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
-r.sb = __funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
-r.sc = __funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
-r.sd = __funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
-r.se = __funnelshift_r (b.se, a.se, c_mod_4 * 8);
-r.sf = __funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
+r.s8 = hc_funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
+r.s9 = hc_funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
+r.sa = hc_funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
+r.sb = hc_funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
+r.sc = hc_funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
+r.sd = hc_funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
+r.se = hc_funnelshift_r (b.se, a.se, c_mod_4 * 8);
+r.sf = hc_funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
 #endif
 return r;
@@ -1826,7 +1827,7 @@ DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
 {
 const int c_mod_4 = c & 3;
-const u32 r = __funnelshift_r (b, a, c_mod_4 * 8);
+const u32 r = hc_funnelshift_r (b, a, c_mod_4 * 8);
 return r;
 }
@@ -1838,35 +1839,35 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
 u32x r;
 #if VECT_SIZE == 1
-r = __funnelshift_l (a, b, c_mod_4 * 8);
+r = hc_funnelshift_l (a, b, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 2
-r.s0 = __funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
-r.s1 = __funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
+r.s0 = hc_funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
+r.s1 = hc_funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 4
-r.s2 = __funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
-r.s3 = __funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
+r.s2 = hc_funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
+r.s3 = hc_funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 8
-r.s4 = __funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
-r.s5 = __funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
-r.s6 = __funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
-r.s7 = __funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
+r.s4 = hc_funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
+r.s5 = hc_funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
+r.s6 = hc_funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
+r.s7 = hc_funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 16
-r.s8 = __funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
-r.s9 = __funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
-r.sa = __funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
-r.sb = __funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
-r.sc = __funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
-r.sd = __funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
-r.se = __funnelshift_l (a.se, b.se, c_mod_4 * 8);
-r.sf = __funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
+r.s8 = hc_funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
+r.s9 = hc_funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
+r.sa = hc_funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
+r.sb = hc_funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
+r.sc = hc_funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
+r.sd = hc_funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
+r.se = hc_funnelshift_l (a.se, b.se, c_mod_4 * 8);
+r.sf = hc_funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
 #endif
 return r;
@@ -1876,10 +1877,51 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
 {
 const int c_mod_4 = c & 3;
-const u32 r = __funnelshift_l (a, b, c_mod_4 * 8);
+const u32 r = hc_funnelshift_l (a, b, c_mod_4 * 8);
 return r;
 }
+#else
+DECLSPEC u32 hc_bytealign_be (const u32 a, const u32 b, const int c)
+{
+const int c_mod_4 = c & 3;
+const u32 r = hc_byte_perm (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+return r;
+}
+DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
+{
+const int c_mod_4 = c & 3;
+const u32 r = hc_byte_perm_S (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+return r;
+}
+DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
+{
+const int c_mod_4 = c & 3;
+const int c_minus_4 = 4 - c_mod_4;
+const u32x r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+return r;
+}
+DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
+{
+const int c_mod_4 = c & 3;
+const int c_minus_4 = 4 - c_mod_4;
+const u32 r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+return r;
+}
+#endif
 DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
 {


@@ -6,6 +6,7 @@
 #include "inc_vendor.h"
 #include "inc_types.h"
 #include "inc_platform.h"
+#include "inc_common.h"
 #ifdef IS_NATIVE
 #define FIXED_THREAD_COUNT(n)
@@ -60,6 +61,33 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
 #endif // IS_AMD
 // this applies to cuda and opencl
 #if defined IS_NV
+#ifdef USE_FUNNELSHIFT
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift)
+{
+u32 result;
+asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+return result;
+}
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift)
+{
+u32 result;
+asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+return result;
+}
+#endif
 #endif // IS_NV
 #if defined IS_CUDA
 #if ATTACK_EXEC == 11
@@ -243,7 +271,7 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
 #ifdef USE_FUNNELSHIFT
-return __funnelshift_l (a, a, n);
+return hc_funnelshift_l (a, a, n);
 #else
 return ((a << n) | ((a >> (32 - n))));
 #endif
@@ -252,7 +280,7 @@ DECLSPEC u32 rotl32_S (const u32 a, const int n)
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
 #ifdef USE_FUNNELSHIFT
-return __funnelshift_r (a, a, n);
+return hc_funnelshift_r (a, a, n);
 #else
 return ((a >> n) | ((a << (32 - n))));
 #endif
@@ -472,29 +500,17 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
-#ifdef USE_FUNNELSHIFT
-return __funnelshift_l (a, a, n);
-#else
-return ((a << n) | ((a >> (32 - n))));
-#endif
+return rotr32_S (a, 32 - n);
 }
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
-#ifdef USE_FUNNELSHIFT
-return __funnelshift_r (a, a, n);
-#else
-return ((a >> n) | ((a << (32 - n))));
-#endif
+return __builtin_amdgcn_alignbit (a, a, n);
 }
 DECLSPEC u64x rotl64 (const u64x a, const int n)
 {
-#if VECT_SIZE == 1
-return rotl64_S (a, n);
-#else
-return ((a << n) | ((a >> (64 - n))));
-#endif
+return rotr64 (a, 64 - n);
 }
 DECLSPEC u64x rotr64 (const u64x a, const int n)
@@ -513,7 +529,6 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n)
 DECLSPEC u64 rotr64_S (const u64 a, const int n)
 {
-#ifdef USE_FUNNELSHIFT
 vconv64_t in;
 in.v64 = a;
@@ -523,21 +538,20 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
 vconv64_t out;
+const int n31 = n & 31;
 if (n < 32)
 {
-out.v32.a = __funnelshift_r (a0, a1, n);
-out.v32.b = __funnelshift_r (a1, a0, n);
+out.v32.a = __builtin_amdgcn_alignbit (a1, a0, n31);
+out.v32.b = __builtin_amdgcn_alignbit (a0, a1, n31);
 }
 else
 {
-out.v32.a = __funnelshift_r (a1, a0, n - 32);
-out.v32.b = __funnelshift_r (a0, a1, n - 32);
+out.v32.a = __builtin_amdgcn_alignbit (a0, a1, n31);
+out.v32.b = __builtin_amdgcn_alignbit (a1, a0, n31);
 }
 return out.v64;
-#else
-return ((a >> n) | ((a << (64 - n))));
-#endif
 }
 #define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0)


@@ -21,6 +21,11 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n);
 DECLSPEC u64 rotr64_S (const u64 a, const int n);
 #endif // IS_AMD
+#ifdef IS_NV
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift);
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift);
+#endif // IS_NV
 #ifdef IS_CUDA
 DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);


@@ -1190,6 +1190,71 @@ KERNEL_FQ KERNEL_FA void m06900_m08 (KERN_ATTR_BASIC ())
 KERNEL_FQ KERNEL_FA void m06900_m16 (KERN_ATTR_BASIC ())
 {
+/**
+* base
+*/
+const u64 lid = get_local_id (0);
+const u64 gid = get_global_id (0);
+const u64 lsz = get_local_size (0);
+/**
+* sbox
+*/
+LOCAL_VK u32 s_tables[4][256];
+for (u32 i = lid; i < 256; i += lsz)
+{
+s_tables[0][i] = c_tables[0][i];
+s_tables[1][i] = c_tables[1][i];
+s_tables[2][i] = c_tables[2][i];
+s_tables[3][i] = c_tables[3][i];
+}
+SYNC_THREADS ();
+if (gid >= GID_CNT) return;
+/**
+* modifier
+*/
+u32 w0[4];
+w0[0] = pws[gid].i[ 0];
+w0[1] = pws[gid].i[ 1];
+w0[2] = pws[gid].i[ 2];
+w0[3] = pws[gid].i[ 3];
+u32 w1[4];
+w1[0] = pws[gid].i[ 4];
+w1[1] = pws[gid].i[ 5];
+w1[2] = pws[gid].i[ 6];
+w1[3] = pws[gid].i[ 7];
+u32 w2[4]; // no change here, because m06900m() doesn't support > 32, but we need a _m16 kernel because hashcat will call _m16 if pw_len >= 32
+w2[0] = 0;
+w2[1] = 0;
+w2[2] = 0;
+w2[3] = 0;
+u32 w3[4];
+w3[0] = 0;
+w3[1] = 0;
+w3[2] = 0;
+w3[3] = 0;
+const u32 pw_len = pws[gid].pw_len & 63;
+/**
+* main
+*/
+m06900m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }
 KERNEL_FQ KERNEL_FA void m06900_s04 (KERN_ATTR_BASIC ())
@@ -1332,4 +1397,69 @@ KERNEL_FQ KERNEL_FA void m06900_s08 (KERN_ATTR_BASIC ())
 KERNEL_FQ KERNEL_FA void m06900_s16 (KERN_ATTR_BASIC ())
 {
+/**
+* base
+*/
+const u64 lid = get_local_id (0);
+const u64 gid = get_global_id (0);
+const u64 lsz = get_local_size (0);
+/**
+* sbox
+*/
+LOCAL_VK u32 s_tables[4][256];
+for (u32 i = lid; i < 256; i += lsz)
+{
+s_tables[0][i] = c_tables[0][i];
+s_tables[1][i] = c_tables[1][i];
+s_tables[2][i] = c_tables[2][i];
+s_tables[3][i] = c_tables[3][i];
+}
+SYNC_THREADS ();
+if (gid >= GID_CNT) return;
+/**
+* modifier
+*/
+u32 w0[4];
+w0[0] = pws[gid].i[ 0];
+w0[1] = pws[gid].i[ 1];
+w0[2] = pws[gid].i[ 2];
+w0[3] = pws[gid].i[ 3];
+u32 w1[4];
+w1[0] = pws[gid].i[ 4];
+w1[1] = pws[gid].i[ 5];
+w1[2] = pws[gid].i[ 6];
+w1[3] = pws[gid].i[ 7];
+u32 w2[4]; // no change here, because m06900s() doesn't support > 32, but we need a _s16 kernel because hashcat will call _s16 if pw_len >= 32
+w2[0] = 0;
+w2[1] = 0;
+w2[2] = 0;
+w2[3] = 0;
+u32 w3[4];
+w3[0] = 0;
+w3[1] = 0;
+w3[2] = 0;
+w3[3] = 0;
+const u32 pw_len = pws[gid].pw_len & 63;
+/**
+* main
+*/
+m06900s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }