
Fix funnelshift usage on AMD and NV platforms

While HIP doesn't have funnelshift emulation, it's better to use the
native __builtin_amdgcn_alignbit() on AMD. For NVIDIA, we need to make
sure the target matches a supported SM version and fall back to
byte_perm() otherwise.
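
As a concrete illustration of the resulting dispatch, a 32-bit right rotate can be written as a funnel shift of a word with itself on NVIDIA, as the native alignbit builtin on AMD, and as plain shifts elsewhere. This is only a sketch: it assumes the hc_funnelshift_r() wrapper added by this commit, and rotr32_sketch plus the exact #ifdef spelling are illustrative, not hashcat's actual macros.

/* Sketch only: rotate right by funnel-shifting a word with itself. */
DECLSPEC u32 rotr32_sketch (const u32 a, const int n)
{
#if defined IS_NV && defined USE_FUNNELSHIFT
  return hc_funnelshift_r (a, a, n);          // shf.r.wrap.b32 via inline PTX, SM target permitting
#elif defined IS_AMD
  return __builtin_amdgcn_alignbit (a, a, n); // native AMD bit-align instruction, no emulation needed
#else
  return ((a >> n) | ((a << (32 - n))));      // portable fallback
#endif
}

When USE_FUNNELSHIFT is not set (unsupported SM target), the byte-alignment helpers fall back to the hc_byte_perm()-based variants added in the #else branch further down, and the plain-shift rotate is used instead.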

Fix hash-mode 6900 in optimized mode and attack mode 3

This hash-mode doesn't use any stopbit, and if the password length is
exactly 16 or 32, then hashcat selects the next higher kernel, e.g., s16
for password length 32. For such corner cases, we must copy s08 code to
s16. It doesn't seem other algorithms are affected. Some have the s16
body left empty, but they have a password length limit < 32.
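
For illustration, a rough model of that kernel-size selection is sketched below; pick_kernel_suffix() and its thresholds are hypothetical, not hashcat's actual host code.

/* Hypothetical sketch: which optimized kernel variant a password length
 * selects. Without a stopbit, a 32-byte password still lands in the _16
 * variant, so that body must not be left empty. */
static const char *pick_kernel_suffix (const unsigned int pw_len)
{
  if (pw_len >= 32) return "s16"; // pw_len == 32 ends up here for hash-mode 6900
  if (pw_len >= 16) return "s08"; // 16..31 bytes
  return "s04";                   // up to 15 bytes
}

Since the optimized m06900 kernels cap the password at 32 bytes (see the w2/w3 comments in the kernel diff below), pw_len == 32 is the only case that reaches the _16 variant, which is why copying the s08 body into s16 is enough.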

Add test_edge* to .gitignore
Jens Steube 2025-07-13 08:59:52 +02:00
parent 4a6b538b43
commit 725528058c
5 changed files with 253 additions and 61 deletions

.gitignore

@@ -29,6 +29,7 @@ obj/*.a
 include/CL
 tools/luks_tests
 .vscode
+test_edge*
 # Byte-compiled / optimized / DLL files
 __pycache__/


@@ -1781,6 +1781,7 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
 return r;
 }
+#ifdef USE_FUNNELSHIFT
 DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
 {
 const int c_mod_4 = c & 3;
@@ -1788,35 +1789,35 @@ DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
 u32x r;
 #if VECT_SIZE == 1
-r = __funnelshift_r (b, a, c_mod_4 * 8);
+r = hc_funnelshift_r (b, a, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 2
-r.s0 = __funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
-r.s1 = __funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
+r.s0 = hc_funnelshift_r (b.s0, a.s0, c_mod_4 * 8);
+r.s1 = hc_funnelshift_r (b.s1, a.s1, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 4
-r.s2 = __funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
-r.s3 = __funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
+r.s2 = hc_funnelshift_r (b.s2, a.s2, c_mod_4 * 8);
+r.s3 = hc_funnelshift_r (b.s3, a.s3, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 8
-r.s4 = __funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
-r.s5 = __funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
-r.s6 = __funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
-r.s7 = __funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
+r.s4 = hc_funnelshift_r (b.s4, a.s4, c_mod_4 * 8);
+r.s5 = hc_funnelshift_r (b.s5, a.s5, c_mod_4 * 8);
+r.s6 = hc_funnelshift_r (b.s6, a.s6, c_mod_4 * 8);
+r.s7 = hc_funnelshift_r (b.s7, a.s7, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 16
-r.s8 = __funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
-r.s9 = __funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
-r.sa = __funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
-r.sb = __funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
-r.sc = __funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
-r.sd = __funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
-r.se = __funnelshift_r (b.se, a.se, c_mod_4 * 8);
-r.sf = __funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
+r.s8 = hc_funnelshift_r (b.s8, a.s8, c_mod_4 * 8);
+r.s9 = hc_funnelshift_r (b.s9, a.s9, c_mod_4 * 8);
+r.sa = hc_funnelshift_r (b.sa, a.sa, c_mod_4 * 8);
+r.sb = hc_funnelshift_r (b.sb, a.sb, c_mod_4 * 8);
+r.sc = hc_funnelshift_r (b.sc, a.sc, c_mod_4 * 8);
+r.sd = hc_funnelshift_r (b.sd, a.sd, c_mod_4 * 8);
+r.se = hc_funnelshift_r (b.se, a.se, c_mod_4 * 8);
+r.sf = hc_funnelshift_r (b.sf, a.sf, c_mod_4 * 8);
 #endif
 return r;
@@ -1826,7 +1827,7 @@ DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
 {
 const int c_mod_4 = c & 3;
-const u32 r = __funnelshift_r (b, a, c_mod_4 * 8);
+const u32 r = hc_funnelshift_r (b, a, c_mod_4 * 8);
 return r;
 }
@@ -1838,35 +1839,35 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
 u32x r;
 #if VECT_SIZE == 1
-r = __funnelshift_l (a, b, c_mod_4 * 8);
+r = hc_funnelshift_l (a, b, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 2
-r.s0 = __funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
-r.s1 = __funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
+r.s0 = hc_funnelshift_l (a.s0, b.s0, c_mod_4 * 8);
+r.s1 = hc_funnelshift_l (a.s1, b.s1, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 4
-r.s2 = __funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
-r.s3 = __funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
+r.s2 = hc_funnelshift_l (a.s2, b.s2, c_mod_4 * 8);
+r.s3 = hc_funnelshift_l (a.s3, b.s3, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 8
-r.s4 = __funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
-r.s5 = __funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
-r.s6 = __funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
-r.s7 = __funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
+r.s4 = hc_funnelshift_l (a.s4, b.s4, c_mod_4 * 8);
+r.s5 = hc_funnelshift_l (a.s5, b.s5, c_mod_4 * 8);
+r.s6 = hc_funnelshift_l (a.s6, b.s6, c_mod_4 * 8);
+r.s7 = hc_funnelshift_l (a.s7, b.s7, c_mod_4 * 8);
 #endif
 #if VECT_SIZE >= 16
-r.s8 = __funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
-r.s9 = __funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
-r.sa = __funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
-r.sb = __funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
-r.sc = __funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
-r.sd = __funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
-r.se = __funnelshift_l (a.se, b.se, c_mod_4 * 8);
-r.sf = __funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
+r.s8 = hc_funnelshift_l (a.s8, b.s8, c_mod_4 * 8);
+r.s9 = hc_funnelshift_l (a.s9, b.s9, c_mod_4 * 8);
+r.sa = hc_funnelshift_l (a.sa, b.sa, c_mod_4 * 8);
+r.sb = hc_funnelshift_l (a.sb, b.sb, c_mod_4 * 8);
+r.sc = hc_funnelshift_l (a.sc, b.sc, c_mod_4 * 8);
+r.sd = hc_funnelshift_l (a.sd, b.sd, c_mod_4 * 8);
+r.se = hc_funnelshift_l (a.se, b.se, c_mod_4 * 8);
+r.sf = hc_funnelshift_l (a.sf, b.sf, c_mod_4 * 8);
 #endif
 return r;
@@ -1876,10 +1877,51 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
 {
 const int c_mod_4 = c & 3;
-const u32 r = __funnelshift_l (a, b, c_mod_4 * 8);
+const u32 r = hc_funnelshift_l (a, b, c_mod_4 * 8);
 return r;
 }
+#else
+DECLSPEC u32 hc_bytealign_be (const u32 a, const u32 b, const int c)
+{
+const int c_mod_4 = c & 3;
+const u32 r = hc_byte_perm (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+return r;
+}
+DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
+{
+const int c_mod_4 = c & 3;
+const u32 r = hc_byte_perm_S (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+return r;
+}
+DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
+{
+const int c_mod_4 = c & 3;
+const int c_minus_4 = 4 - c_mod_4;
+const u32x r = hc_byte_perm (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+return r;
+}
+DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
+{
+const int c_mod_4 = c & 3;
+const int c_minus_4 = 4 - c_mod_4;
+const u32 r = hc_byte_perm_S (a, b, (0x76543210 >> (c_minus_4 * 4)) & 0xffff);
+return r;
+}
+#endif
 DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
 {


@@ -6,6 +6,7 @@
 #include "inc_vendor.h"
 #include "inc_types.h"
 #include "inc_platform.h"
+#include "inc_common.h"
 #ifdef IS_NATIVE
 #define FIXED_THREAD_COUNT(n)
@@ -60,6 +61,33 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
 #endif // IS_AMD
 // this applies to cuda and opencl
 #if defined IS_NV
+#ifdef USE_FUNNELSHIFT
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift)
+{
+u32 result;
+asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+return result;
+}
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift)
+{
+u32 result;
+asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(lo), "r"(hi), "r"(shift));
+return result;
+}
+#endif
 #endif // IS_NV
 #if defined IS_CUDA
 #if ATTACK_EXEC == 11
@@ -243,7 +271,7 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
 #ifdef USE_FUNNELSHIFT
-return __funnelshift_l (a, a, n);
+return hc_funnelshift_l (a, a, n);
 #else
 return ((a << n) | ((a >> (32 - n))));
 #endif
@@ -252,7 +280,7 @@ DECLSPEC u32 rotl32_S (const u32 a, const int n)
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
 #ifdef USE_FUNNELSHIFT
-return __funnelshift_r (a, a, n);
+return hc_funnelshift_r (a, a, n);
 #else
 return ((a >> n) | ((a << (32 - n))));
 #endif
@@ -472,29 +500,17 @@ DECLSPEC u32x rotr32 (const u32x a, const int n)
 DECLSPEC u32 rotl32_S (const u32 a, const int n)
 {
-#ifdef USE_FUNNELSHIFT
-return __funnelshift_l (a, a, n);
-#else
-return ((a << n) | ((a >> (32 - n))));
-#endif
+return rotr32_S (a, 32 - n);
 }
 DECLSPEC u32 rotr32_S (const u32 a, const int n)
 {
-#ifdef USE_FUNNELSHIFT
-return __funnelshift_r (a, a, n);
-#else
-return ((a >> n) | ((a << (32 - n))));
-#endif
+return __builtin_amdgcn_alignbit (a, a, n);
 }
 DECLSPEC u64x rotl64 (const u64x a, const int n)
 {
-#if VECT_SIZE == 1
-return rotl64_S (a, n);
-#else
-return ((a << n) | ((a >> (64 - n))));
-#endif
+return rotr64 (a, 64 - n);
 }
 DECLSPEC u64x rotr64 (const u64x a, const int n)
@@ -513,7 +529,6 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n)
 DECLSPEC u64 rotr64_S (const u64 a, const int n)
 {
-#ifdef USE_FUNNELSHIFT
 vconv64_t in;
 in.v64 = a;
@@ -523,21 +538,20 @@ DECLSPEC u64 rotr64_S (const u64 a, const int n)
 vconv64_t out;
+const int n31 = n & 31;
 if (n < 32)
 {
-out.v32.a = __funnelshift_r (a0, a1, n);
-out.v32.b = __funnelshift_r (a1, a0, n);
+out.v32.a = __builtin_amdgcn_alignbit (a1, a0, n31);
+out.v32.b = __builtin_amdgcn_alignbit (a0, a1, n31);
 }
 else
 {
-out.v32.a = __funnelshift_r (a1, a0, n - 32);
-out.v32.b = __funnelshift_r (a0, a1, n - 32);
+out.v32.a = __builtin_amdgcn_alignbit (a0, a1, n31);
+out.v32.b = __builtin_amdgcn_alignbit (a1, a0, n31);
 }
 return out.v64;
-#else
-return ((a >> n) | ((a << (64 - n))));
-#endif
 }
 #define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0)


@@ -21,6 +21,11 @@ DECLSPEC u64 rotl64_S (const u64 a, const int n);
 DECLSPEC u64 rotr64_S (const u64 a, const int n);
 #endif // IS_AMD
+#ifdef IS_NV
+DECLSPEC u32 hc_funnelshift_l (const u32 lo, const u32 hi, const int shift);
+DECLSPEC u32 hc_funnelshift_r (const u32 lo, const u32 hi, const int shift);
+#endif // IS_NV
 #ifdef IS_CUDA
 DECLSPEC u32 hc_atomic_dec (volatile GLOBAL_AS u32 *p);
 DECLSPEC u32 hc_atomic_inc (volatile GLOBAL_AS u32 *p);


@@ -1190,6 +1190,71 @@ KERNEL_FQ KERNEL_FA void m06900_m08 (KERN_ATTR_BASIC ())
 KERNEL_FQ KERNEL_FA void m06900_m16 (KERN_ATTR_BASIC ())
 {
+/**
+* base
+*/
+const u64 lid = get_local_id (0);
+const u64 gid = get_global_id (0);
+const u64 lsz = get_local_size (0);
+/**
+* sbox
+*/
+LOCAL_VK u32 s_tables[4][256];
+for (u32 i = lid; i < 256; i += lsz)
+{
+s_tables[0][i] = c_tables[0][i];
+s_tables[1][i] = c_tables[1][i];
+s_tables[2][i] = c_tables[2][i];
+s_tables[3][i] = c_tables[3][i];
+}
+SYNC_THREADS ();
+if (gid >= GID_CNT) return;
+/**
+* modifier
+*/
+u32 w0[4];
+w0[0] = pws[gid].i[ 0];
+w0[1] = pws[gid].i[ 1];
+w0[2] = pws[gid].i[ 2];
+w0[3] = pws[gid].i[ 3];
+u32 w1[4];
+w1[0] = pws[gid].i[ 4];
+w1[1] = pws[gid].i[ 5];
+w1[2] = pws[gid].i[ 6];
+w1[3] = pws[gid].i[ 7];
+u32 w2[4]; // no change here, because m06900m() doesn't support > 32, but we need a _m16 kernel because hashcat will call _m16 if pw_len >= 32
+w2[0] = 0;
+w2[1] = 0;
+w2[2] = 0;
+w2[3] = 0;
+u32 w3[4];
+w3[0] = 0;
+w3[1] = 0;
+w3[2] = 0;
+w3[3] = 0;
+const u32 pw_len = pws[gid].pw_len & 63;
+/**
+* main
+*/
+m06900m (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }
 KERNEL_FQ KERNEL_FA void m06900_s04 (KERN_ATTR_BASIC ())
@@ -1332,4 +1397,69 @@ KERNEL_FQ KERNEL_FA void m06900_s08 (KERN_ATTR_BASIC ())
 KERNEL_FQ KERNEL_FA void m06900_s16 (KERN_ATTR_BASIC ())
 {
+/**
+* base
+*/
+const u64 lid = get_local_id (0);
+const u64 gid = get_global_id (0);
+const u64 lsz = get_local_size (0);
+/**
+* sbox
+*/
+LOCAL_VK u32 s_tables[4][256];
+for (u32 i = lid; i < 256; i += lsz)
+{
+s_tables[0][i] = c_tables[0][i];
+s_tables[1][i] = c_tables[1][i];
+s_tables[2][i] = c_tables[2][i];
+s_tables[3][i] = c_tables[3][i];
+}
+SYNC_THREADS ();
+if (gid >= GID_CNT) return;
+/**
+* modifier
+*/
+u32 w0[4];
+w0[0] = pws[gid].i[ 0];
+w0[1] = pws[gid].i[ 1];
+w0[2] = pws[gid].i[ 2];
+w0[3] = pws[gid].i[ 3];
+u32 w1[4];
+w1[0] = pws[gid].i[ 4];
+w1[1] = pws[gid].i[ 5];
+w1[2] = pws[gid].i[ 6];
+w1[3] = pws[gid].i[ 7];
+u32 w2[4]; // no change here, because m06900s() doesn't support > 32, but we need a _s16 kernel because hashcat will call _s16 if pw_len >= 32
+w2[0] = 0;
+w2[1] = 0;
+w2[2] = 0;
+w2[3] = 0;
+u32 w3[4];
+w3[0] = 0;
+w3[1] = 0;
+w3[2] = 0;
+w3[3] = 0;
+const u32 pw_len = pws[gid].pw_len & 63;
+/**
+* main
+*/
+m06900s (w0, w1, w2, w3, pw_len, pws, rules_buf, combs_buf, bfs_buf, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz, s_tables);
 }