diff --git a/.gitignore b/.gitignore
index b268eedfb..99a9615aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ obj/*.a
 include/CL
 tools/luks_tests
 .vscode
+test_edge*

 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl
index d029a2bf3..e0a9f28ed 100644
--- a/OpenCL/inc_common.cl
+++ b/OpenCL/inc_common.cl
@@ -1781,6 +1781,7 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
   return r;
 }

+#ifdef USE_FUNNELSHIFT
 DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
 {
   const int c_mod_4 = c & 3;
@@ -1788,35 +1789,35 @@ DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
   u32x r;

   #if VECT_SIZE == 1
-  r = __funnelshift_r (b, a, c_mod_4 * 8);
+  r = hc_funnelshift_r (b, a, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 2
-  r.s0 = __funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
-  r.s1 = __funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
+  r.s0 = hc_funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
+  r.s1 = hc_funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 4
-  r.s2 = __funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
-  r.s3 = __funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
+  r.s2 = hc_funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
+  r.s3 = hc_funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 8
-  r.s4 = __funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
-  r.s5 = __funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
-  r.s6 = __funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
-  r.s7 = __funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
+  r.s4 = hc_funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
+  r.s5 = hc_funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
+  r.s6 = hc_funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
+  r.s7 = hc_funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 16
-  r.s8 = __funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
-  r.s9 = __funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
-  r.sa = __funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
-  r.sb = __funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
-  r.sc = __funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
-  r.sd = __funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
-  r.se = __funnelshift_r (b.se, a.se, c_mod_4 * 8);
-  r.sf = __funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
+  r.s8 = hc_funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
+  r.s9 = hc_funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
+  r.sa = hc_funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
+  r.sb = hc_funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
+  r.sc = hc_funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
+  r.sd = hc_funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
+  r.se = hc_funnelshift_r (b.se, a.se, c_mod_4 * 8);
+  r.sf = hc_funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
   #endif

   return r;
@@ -1826,7 +1827,7 @@ DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
 {
   const int c_mod_4 = c & 3;

-  const u32 r = __funnelshift_r (b, a, c_mod_4 * 8);
+  const u32 r = hc_funnelshift_r (b, a, c_mod_4 * 8);

   return r;
 }
@@ -1838,35 +1839,35 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
   u32x r;

   #if VECT_SIZE == 1
-  r = __funnelshift_l (a, b, c_mod_4 * 8);
+  r = hc_funnelshift_l (a, b, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 2
-  r.s0 = __funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
-  r.s1 = __funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
+  r.s0 = hc_funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
+  r.s1 = hc_funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 4
-  r.s2 = __funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
-  r.s3 = __funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
+  r.s2 = hc_funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
+  r.s3 = hc_funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 8
-  r.s4 = __funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
-  r.s5 = __funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
-  r.s6 = __funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
-  r.s7 = __funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
+  r.s4 = hc_funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
+  r.s5 = hc_funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
+  r.s6 = hc_funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
+  r.s7 = hc_funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
   #endif

   #if VECT_SIZE >= 16
-  r.s8 = __funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
-  r.s9 = __funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
-  r.sa = __funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
-  r.sb = __funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
-  r.sc = __funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
-  r.sd = __funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
-  r.se = __funnelshift_l (a.se, b.se, c_mod_4 * 8);
-  r.sf = __funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
+  r.s8 = hc_funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
+  r.s9 = hc_funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
+  r.sa = hc_funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
+  r.sb = hc_funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
+  r.sc = hc_funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
+  r.sd = hc_funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
+  r.se = hc_funnelshift_l (a.se, b.se, c_mod_4 * 8);
+  r.sf = hc_funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
   #endif

   return r;
@@ -1876,10 +1877,51 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
 {
   const int c_mod_4 = c & 3;

-  const u32 r = __funnelshift_l (a, b, c_mod_4 * 8);
+  const u32 r = hc_funnelshift_l (a, b, c_mod_4 * 8);

   return r;
 }
+#else
+DECLSPEC u32 hc_bytealign_be (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const u32 r = hc_byte_perm (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const u32 r = hc_byte_perm_S (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const int c_minus_4 = 4 - c_mod_4;
+
+  const u32x r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const int c_minus_4 = 4 - c_mod_4;
+
+  const u32 r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+
+  return r;
+}
+#endif

 DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
 {
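/*
 * Editor's note -- not part of the patch. The #else branch above rebuilds the
 * bytealign helpers on top of hc_byte_perm, and the selector math can be
 * sanity-checked on the host. The sketch below is illustrative only:
 * byte_perm() is a local model written for this note, assuming CUDA
 * __byte_perm semantics (result byte i is byte sel.nibble(i) of the
 * little-endian 8-byte pool {y, x}); the references implement the
 * funnel-shift definitions used in the #ifdef USE_FUNNELSHIFT branch.
 */

#include <stdint.h>
#include <assert.h>

static uint32_t byte_perm (uint32_t x, uint32_t y, uint32_t sel)
{
  const uint64_t v = ((uint64_t) y << 32) | x; // pool: x = bytes 0-3, y = bytes 4-7

  uint32_t r = 0;

  for (int i = 0; i < 4; i++)
  {
    const int b = (sel >> (i * 4)) & 7; // each selector nibble picks a source byte

    r |= (uint32_t) ((v >> (b * 8)) & 0xff) << (i * 8);
  }

  return r;
}

int main (void)
{
  const uint32_t a = 0x11223344;
  const uint32_t b = 0xaabbccdd;

  for (int c = 0; c < 4; c++)
  {
    // funnel-shift references: bytealign_be = low word of (a:b) >> (c * 8),
    // bytealign = high word of (b:a) << (c * 8)
    const uint32_t ref_be = (uint32_t) (((((uint64_t) a << 32) | b) >> (c * 8)));
    const uint32_t ref_le = (uint32_t) ((((((uint64_t) b << 32) | a) << (c * 8))) >> 32);

    assert (byte_perm (b, a, (0x76543210 >> (c * 4)) & 0xffff) == ref_be);
    assert (byte_perm (a, b, (0x76543210 >> ((4 - c) * 4)) & 0xffff) == ref_le);
  }

  return 0;
}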
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index 0a386f22e..284dbf02a 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -6,6 +6,7 @@
 #include "inc_vendor.h"
 #include "inc_types.h"
 #include "inc_platform.h"
+#include "inc_common.h"

 #ifdef IS_NATIVE
 #define FIXED_THREAD_COUNT(n)
@@ -60,6 +61,33 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)

 #endif // IS_AMD

+// this applies to cuda and opencl
+#if defined IS_NV
+
+#ifdef USE_FUNNELSHIFT
+
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift)
+{
+  u32 result;
+
+  asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+
+  return result;
+}
+
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift)
+{
+  u32 result;
+
+  asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+
+  return result;
+}
+
+#endif
+
+#endif // IS_NV
+
 #if defined IS_CUDA

 #if ATTACK_EXEC == 11
@@ -243,7 +271,7 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
   #ifdef USE_FUNNELSHIFT
-  return __funnelshift_l (a, a, n);
+  return hc_funnelshift_l (a, a, n);
   #else
   return ((a << n) | ((a >> (32 - n))));
   #endif
 }
@@ -252,7 +280,7 @@ DECLSPEC u32 rotl32_S (const u32 a, const int n)
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
   #ifdef USE_FUNNELSHIFT
-  return __funnelshift_r (a, a, n);
+  return hc_funnelshift_r (a, a, n);
   #else
   return ((a >> n) | ((a << (32 - n))));
   #endif
 }
@@ -472,29 +500,17 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)

 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
-  return __funnelshift_l (a, a, n);
-  #else
-  return ((a << n) | ((a >> (32 - n))));
-  #endif
+  return rotr32_S (a, 32 - n);
 }

 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
-  return __funnelshift_r (a, a, n);
-  #else
-  return ((a >> n) | ((a << (32 - n))));
-  #endif
+  return __builtin_amdgcn_alignbit (a, a, n);
 }

 DECLSPEC u64x rotl64 (const u64x a, const int n)
 {
-  #if VECT_SIZE == 1
-  return rotl64_S (a, n);
-  #else
-  return ((a << n) | ((a >> (64 - n))));
-  #endif
+  return rotr64 (a, 64 - n);
 }

 DECLSPEC u64x rotr64 (const u64x a, const int n)
@@ -513,7 +529,6 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n)

 DECLSPEC u64 rotr64_S (const u64 a, const int n)
 {
-  #ifdef USE_FUNNELSHIFT
   vconv64_t in;

   in.v64 = a;
@@ -523,21 +538,20 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)

   vconv64_t out;

+  const int n31 = n & 31;
+
   if (n < 32)
   {
-    out.v32.a = __funnelshift_r (a0, a1, n);
-    out.v32.b = __funnelshift_r (a1, a0, n);
+    out.v32.a = __builtin_amdgcn_alignbit (a1, a0, n31);
+    out.v32.b = __builtin_amdgcn_alignbit (a0, a1, n31);
   }
   else
   {
-    out.v32.a = __funnelshift_r (a1, a0, n - 32);
-    out.v32.b = __funnelshift_r (a0, a1, n - 32);
+    out.v32.a = __builtin_amdgcn_alignbit (a0, a1, n31);
+    out.v32.b = __builtin_amdgcn_alignbit (a1, a0, n31);
   }

   return out.v64;
-  #else
-  return ((a >> n) | ((a << (64 - n))));
-  #endif
 }

 #define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0)
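/*
 * Editor's note -- not part of the patch. hc_funnelshift_l/_r above wrap the
 * PTX funnel-shift instruction. Per the PTX ISA, shf.l.wrap.b32 returns the
 * upper 32 bits of {hi, lo} << (shift & 31), and shf.r.wrap.b32 the lower
 * 32 bits of {hi, lo} >> (shift & 31). A minimal host-side model of those
 * semantics, checking the hi == lo special case that rotl32_S/rotr32_S rely on:
 */

#include <stdint.h>
#include <assert.h>

static uint32_t funnelshift_l (uint32_t lo, uint32_t hi, int shift)
{
  const uint64_t v = ((uint64_t) hi << 32) | lo;

  return (uint32_t) ((v << (shift & 31)) >> 32); // upper word after left shift
}

static uint32_t funnelshift_r (uint32_t lo, uint32_t hi, int shift)
{
  const uint64_t v = ((uint64_t) hi << 32) | lo;

  return (uint32_t) (v >> (shift & 31)); // lower word after right shift
}

int main (void)
{
  const uint32_t a = 0xdeadbeef;

  for (int n = 1; n < 32; n++)
  {
    assert (funnelshift_l (a, a, n) == ((a << n) | (a >> (32 - n)))); // rotl32
    assert (funnelshift_r (a, a, n) == ((a >> n) | (a << (32 - n)))); // rotr32
  }

  return 0;
}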
diff --git a/OpenCL/inc_platform.h b/OpenCL/inc_platform.h
index 9729d4fad..800988056 100644
--- a/OpenCL/inc_platform.h
+++ b/OpenCL/inc_platform.h
@@ -21,6 +21,11 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n);
 DECLSPEC u64 rotr64_S (const u64 a, const int n);
 #endif // IS_AMD

+#ifdef IS_NV
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift);
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift);
+#endif // IS_NV
+
 #ifdef IS_CUDA
 DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);
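/*
 * Editor's note -- not part of the patch. The rotr64_S rewrite in
 * inc_platform.cl above decomposes a 64-bit rotate into two 32-bit funnel
 * shifts. A host-side sketch of that decomposition, assuming
 * __builtin_amdgcn_alignbit (hi, lo, n) returns the low 32 bits of
 * {hi, lo} >> (n & 31), and that vconv64_t maps .v32.a/.v32.b to the
 * low/high word respectively:
 */

#include <stdint.h>
#include <assert.h>

static uint32_t alignbit (uint32_t hi, uint32_t lo, int n)
{
  const uint64_t v = ((uint64_t) hi << 32) | lo;

  return (uint32_t) (v >> (n & 31));
}

static uint64_t rotr64_model (uint64_t a, int n)
{
  const uint32_t a0 = (uint32_t) (a >>  0); // low word  (in.v32.a)
  const uint32_t a1 = (uint32_t) (a >> 32); // high word (in.v32.b)

  const int n31 = n & 31;

  uint32_t lo;
  uint32_t hi;

  if (n < 32) // rotate amount below word size: words keep their roles
  {
    lo = alignbit (a1, a0, n31);
    hi = alignbit (a0, a1, n31);
  }
  else        // rotate amount >= 32: same 5-bit shift, words swapped
  {
    lo = alignbit (a0, a1, n31);
    hi = alignbit (a1, a0, n31);
  }

  return ((uint64_t) hi << 32) | lo;
}

int main (void)
{
  const uint64_t a = 0x0123456789abcdefULL;

  for (int n = 1; n < 64; n++)
  {
    assert (rotr64_model (a, n) == ((a >> n) | (a << (64 - n))));
  }

  return 0;
}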
diff --git a/OpenCL/m06900_a3-optimized.cl b/OpenCL/m06900_a3-optimized.cl
index d71302d55..064eb3e58 100644
--- a/OpenCL/m06900_a3-optimized.cl
+++ b/OpenCL/m06900_a3-optimized.cl
@@ -1190,6 +1190,71 @@ KERNEL_FQ KERNEL_FA void m06900_m08 (KERN_ATTR_BASIC ())

 KERNEL_FQ KERNEL_FA void m06900_m16 (KERN_ATTR_BASIC ())
 {
+  /**
+   * base
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * sbox
+   */
+
+  LOCAL_VK u32 s_tables[4][256];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    s_tables[0][i] = c_tables[0][i];
+    s_tables[1][i] = c_tables[1][i];
+    s_tables[2][i] = c_tables[2][i];
+    s_tables[3][i] = c_tables[3][i];
+  }
+
+  SYNC_THREADS ();
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * modifier
+   */
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4]; // no change here, because m06900m() doesn't support > 32, but we need a _m16 kernel because hashcat will call _m16 if pw_len >= 32
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  /**
+   * main
+   */
+
+  m06900m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }

 KERNEL_FQ KERNEL_FA void m06900_s04 (KERN_ATTR_BASIC ())
@@ -1332,4 +1397,69 @@ KERNEL_FQ KERNEL_FA void m06900_s08 (KERN_ATTR_BASIC ())

 KERNEL_FQ KERNEL_FA void m06900_s16 (KERN_ATTR_BASIC ())
 {
+  /**
+   * base
+   */
+
+  const u64 lid = get_local_id (0);
+  const u64 gid = get_global_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * sbox
+   */
+
+  LOCAL_VK u32 s_tables[4][256];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    s_tables[0][i] = c_tables[0][i];
+    s_tables[1][i] = c_tables[1][i];
+    s_tables[2][i] = c_tables[2][i];
+    s_tables[3][i] = c_tables[3][i];
+  }
+
+  SYNC_THREADS ();
+
+  if (gid >= GID_CNT) return;
+
+  /**
+   * modifier
+   */
+
+  u32 w0[4];
+
+  w0[0] = pws[gid].i[ 0];
+  w0[1] = pws[gid].i[ 1];
+  w0[2] = pws[gid].i[ 2];
+  w0[3] = pws[gid].i[ 3];
+
+  u32 w1[4];
+
+  w1[0] = pws[gid].i[ 4];
+  w1[1] = pws[gid].i[ 5];
+  w1[2] = pws[gid].i[ 6];
+  w1[3] = pws[gid].i[ 7];
+
+  u32 w2[4]; // no change here, because m06900s() doesn't support > 32, but we need a _s16 kernel because hashcat will call _s16 if pw_len >= 32
+
+  w2[0] = 0;
+  w2[1] = 0;
+  w2[2] = 0;
+  w2[3] = 0;
+
+  u32 w3[4];
+
+  w3[0] = 0;
+  w3[1] = 0;
+  w3[2] = 0;
+  w3[3] = 0;
+
+  const u32 pw_len = pws[gid].pw_len & 63;
+
+  /**
+   * main
+   */
+
+  m06900s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }