
Increase default iteration count per kernel invocation from 1024 to 2048

Add support for iteration counts per kernel invocation below the default, enabling TMTO for low scrypt configurations such as N=1024
Use TMTO 2 whenever it still reaches 4 times the device processor count, instead of always using TMTO 1
Improve performance for low scrypt configurations (hash-mode 9300)
Fix unit test for 15700 to use the correct scrypt configuration
Disable CPU over-subscription for SCRYPT-based algorithms
Jens Steube 2025-06-15 21:14:40 +02:00
parent 4d2485db0f
commit e134564a73
4 changed files with 55 additions and 45 deletions
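
Background for the changes below: scrypt's smix phase stores N blocks of 128·r bytes in the V[] scratchpad, and a time-memory trade-off (TMTO) of t keeps only every 2^t-th block, recomputing skipped blocks on demand. A runnable sketch of the footprint (illustrative values and names, not hashcat code):

```c
#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  const uint64_t scrypt_N = 1024; // a "low" configuration, as in the commit message
  const uint64_t scrypt_r = 1;

  const uint64_t v_bytes = 128 * scrypt_r * scrypt_N; // full V[] scratchpad

  for (int tmto = 0; tmto <= 2; tmto++)
  {
    // keep every 2^tmto-th entry; up to 2^tmto - 1 BlockMix recomputations per lookup
    printf ("tmto=%d: %6" PRIu64 " bytes\n", tmto, v_bytes >> tmto);
  }

  return 0;
}
```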

View File

@@ -345,9 +345,11 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
     case 3: V = V3; break;
   }
 
-  // note: fixed 1024 iterations = forced -u 1024
+  // note: max 2048 iterations = forced -u 2048
 
-  for (u32 N_pos = 0; N_pos < 1024; N_pos++)
+  const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
+
+  for (u32 N_pos = 0; N_pos < N_max; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
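
The clamp above lets the host run the smix loop in slices no larger than 2048 iterations while still handling configurations whose remaining work (ySIZE here) is smaller than one slice. A runnable sketch of that chunking idea, assuming ySIZE corresponds to scrypt_N reduced by the trade-off (launch_smix_kernel and all values are hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

// Hypothetical stand-in for a kernel launch; hashcat drives this through
// its OpenCL/CUDA/HIP backend.
static void launch_smix_kernel (uint32_t pos, uint32_t count)
{
  printf ("launch: N_pos=%u, iterations=%u\n", pos, count);
}

int main (void)
{
  const uint32_t scrypt_N = 1024; // low configuration from the commit message
  const uint32_t tmto     = 0;

  const uint32_t total = scrypt_N >> tmto;              // iterations for one pass
  const uint32_t chunk = (2048 > total) ? total : 2048; // mirrors the kernel clamp

  for (uint32_t pos = 0; pos < total; pos += chunk)
  {
    const uint32_t count = ((total - pos) < chunk) ? (total - pos) : chunk;

    launch_smix_kernel (pos, count);
  }

  return 0;
}
```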

View File

@@ -26,8 +26,20 @@ inline __device__ uint4 operator + (const uint4 a, const uint4 b) { return mak
 inline __device__ uint4 operator ^ (const uint4 a, const uint4 b) { return make_uint4 ((a.x ^ b.x), (a.y ^ b.y), (a.z ^ b.z), (a.w ^ b.w)); }
 inline __device__ uint4 operator | (const uint4 a, const uint4 b) { return make_uint4 ((a.x | b.x), (a.y | b.y), (a.z | b.z), (a.w | b.w)); }
 inline __device__ void operator ^= ( uint4 &a, const uint4 b) { a.x ^= b.x; a.y ^= b.y; a.z ^= b.z; a.w ^= b.w; }
 #endif
 
-inline __device__ uint4 rotate (const uint4 a, const int n) { return ((a << n) | ((a >> (32 - n)))); }
+#if defined IS_CUDA || defined IS_HIP
+inline __device__ uint4 rotate (const uint4 a, const int n)
+{
+  uint4 r;
+
+  r.x = hc_rotl32_S (a.x, n);
+  r.y = hc_rotl32_S (a.y, n);
+  r.z = hc_rotl32_S (a.z, n);
+  r.w = hc_rotl32_S (a.w, n);
+
+  return r;
+}
+#endif
 
 #endif
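
The old one-liner built the vector rotate from uint4 shift operators; routing each 32-bit lane through hashcat's scalar helper presumably lets the CUDA/HIP compiler pick its native per-lane rotate/funnel-shift. A portable, runnable stand-in for what hc_rotl32_S computes on one lane (illustrative, not hashcat's implementation):

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32 (const uint32_t a, const int n)
{
  return (a << n) | (a >> ((32 - n) & 31)); // mask keeps n == 0 well-defined
}

int main (void)
{
  printf ("0x%08x\n", rotl32 (0x80000001u, 1)); // prints 0x00000003

  return 0;
}
```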

View File

@@ -10,14 +10,14 @@
 u32 scrypt_module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1024;
+  const u32 kernel_loops_min = 2048;
 
   return kernel_loops_min;
 }
 
 u32 scrypt_module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1024;
+  const u32 kernel_loops_max = 2048;
 
   return kernel_loops_max;
 }
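
Pinning kernel_loops_min == kernel_loops_max removes the autotuner's choice: every invocation processes up to 2048 iterations (the forced -u 2048 noted in the kernel comment), so a full pass takes ceil(N / 2048) launches. A small runnable sketch of that arithmetic (the exact relationship to N is assumed from the removed 1024 checks further down):

```c
#include <stdio.h>

int main (void)
{
  const unsigned kernel_loops = 2048; // pinned: min == max

  for (unsigned scrypt_N = 1024; scrypt_N <= 262144; scrypt_N <<= 4)
  {
    // kernel invocations per smix pass at tmto 0
    printf ("N=%6u -> %u invocation(s)\n", scrypt_N,
            (scrypt_N + kernel_loops - 1) / kernel_loops);
  }

  return 0;
}
```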
@@ -84,7 +84,8 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   }
   else
   {
-    // find a nice kernel_accel programmatically
+    // find a nice kernel_accel for gpus programmatically
+    // on cpus there's no need for over subscription with scrypt
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
@@ -92,45 +93,51 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
       {
         const float multi = (float) available_mem / size_per_accel;
 
-        int accel_multi;
-
-        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
+        if ((multi * 2) >= device_processors)
         {
-          kernel_accel_new = multi * (1 << accel_multi);
+          kernel_accel_new  = multi * 2;
+          kernel_accel_new -= 2;
 
-          if (kernel_accel_new >= device_processors) break;
+          if ((multi * 4) >= device_processors * 2)
+          {
+            kernel_accel_new  = multi * 4;
+            kernel_accel_new -= 4;
+          }
         }
+        else if ((multi * 4) >= device_processors)
+        {
+          kernel_accel_new  = multi * 4;
+          kernel_accel_new -= 4;
+        }
 
-        // we need some space for tmps[], ...
-        kernel_accel_new -= (1 << accel_multi);
-
         // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
-        if ((kernel_accel_new > device_processors) && ((device_processors * 1.16) > kernel_accel_new))
+        if (kernel_accel_new > device_processors)
         {
-          kernel_accel_new = device_processors;
+          const u32 extra = kernel_accel_new % device_processors;
+
+          if (extra < (device_processors * 0.16))
+          {
+            kernel_accel_new -= extra;
+          }
         }
       }
       else
       {
-        for (int i = 1; i <= 8; i++)
+        u64 multi = available_mem / size_per_accel;
+
+        if (tmto == 0)
         {
-          if ((size_per_accel * device_processors * i) < available_mem)
-          {
-            kernel_accel_new = device_processors * i;
-          }
-        }
-      }
-    }
-    else
-    {
-      for (int i = 1; i <= 8; i++)
-      {
-        if ((size_per_accel * device_processors * i) < available_mem)
-        {
-          kernel_accel_new = device_processors * i;
+          tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
+
+          multi = available_mem / (size_per_accel >> tmto);
         }
+
+        multi /= device_processors;
+        multi -= 4;
+        multi = MIN (16, multi);
+
+        kernel_accel_new = device_processors * multi; // should be safe because of tmto
       }
     }
   }
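
Read as a whole, the new GPU logic replaces the old "keep doubling until we beat the processor count" loop with explicit 2x/4x over-subscription tiers plus a snap-to-multiple clamp. A compilable paraphrase with simplified types (pick_accel is a name invented here, not hashcat's):

```c
#include <stdint.h>
#include <stdio.h>

// multi = how many full scratchpads fit in available memory.
static uint32_t pick_accel (const float multi, const uint32_t device_processors)
{
  uint32_t kernel_accel = 0;

  if ((multi * 2) >= device_processors)
  {
    kernel_accel = (uint32_t) (multi * 2) - 2; // 2x tier, minus headroom for tmps[]

    if ((multi * 4) >= device_processors * 2)
    {
      kernel_accel = (uint32_t) (multi * 4) - 4; // 4x tier if memory covers 2x the processors
    }
  }
  else if ((multi * 4) >= device_processors)
  {
    kernel_accel = (uint32_t) (multi * 4) - 4;
  }

  if (kernel_accel > device_processors)
  {
    // snap down to a multiple of the processor count if within ~16%
    const uint32_t extra = kernel_accel % device_processors;

    if (extra < (uint32_t) (device_processors * 0.16f)) kernel_accel -= extra;
  }

  return kernel_accel;
}

int main (void)
{
  printf ("%u\n", pick_accel (256.0f, 128)); // prints 1020

  return 0;
}
```

With 256 scratchpads fitting and 128 processors, the 4x tier yields 1020; the snap leaves it untouched because 1020 % 128 = 124, well above the ~16% window of about 20.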
@@ -144,9 +151,6 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
   {
-    // we have 1024 hard-coded in the kernel
-
-    if ((scrypt_N / (1 << tmto_new)) < 1024) continue;
-
     // global memory check
 
     if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
@@ -187,14 +191,6 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
 
   u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  // We must maintain at least 1024 iteration it's hard-coded in the kernel
-
-  if ((scrypt_N / (1 << tmto)) < 1024)
-  {
-    fprintf (stderr, "ERROR: SCRYPT-N parameter too low. Invalid tmto specified?\n");
-
-    return -1;
-  }
-
   return size_scrypt / (1 << tmto);
 }
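
With the 1024-iteration floor gone, the buffer request reduces cleanly to a shift by the trade-off. A worked instance with assumed values (the 1020 carries over from the GPU sketch above):

```c
#include <inttypes.h>
#include <stdio.h>

int main (void)
{
  const uint64_t size_per_accel   = 128ULL * 1 * 1024; // 128 * r * N for r = 1, N = 1024
  const uint64_t kernel_accel_max = 1020;              // assumed, from the sketch above
  const uint64_t tmto             = 2;

  const uint64_t size_scrypt = size_per_accel * kernel_accel_max;

  printf ("%" PRIu64 " bytes\n", size_scrypt >> tmto); // 33423360 (~31.9 MiB)

  return 0;
}
```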

View File

@@ -17,8 +17,8 @@ sub module_generate_hash
 {
   my $word = shift;
   my $salt = shift;
-  my $scrypt_N = shift || 1024; # 262144 originally
-  my $scrypt_r = shift || 1;    # 8 originally
+  my $scrypt_N = shift || 262144;
+  my $scrypt_r = shift || 8;
   my $scrypt_p = shift || 1;
   my $ciphertext = shift || random_bytes (32);
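
The test module now defaults to the real-world parameters of hash-mode 15700 (Ethereum wallet, scrypt): N=262144, r=8, p=1. Those defaults imply a substantial per-hash scratchpad, which is what makes the tuning logic above matter; a quick runnable check:

```c
#include <stdio.h>

int main (void)
{
  const unsigned long long bytes = 128ULL * 8 * 262144; // 128 * r * N

  printf ("%llu MiB per hash\n", bytes / (1024 * 1024)); // prints: 256 MiB per hash

  return 0;
}
```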