
Add support for detecting unified GPU memory on CUDA and HIP (previously available only for OpenCL and Metal).

Do not adjust kernel-accel or scrypt-tmto for GPUs with unified memory, typically integrated GPUs in CPUs (APUs).
Redesign the "4-buffer" strategy to avoid overallocation from naive division by four, which can significantly increase memory usage for high scrypt configurations (e.g., 256k:8:1).
Update the scrypt B[] access pattern to match the new "4-buffer" design.
Allow user-specified kernel-accel and scrypt-tmto values, individually or both, via command line and tuning database. Any unspecified parameters are adjusted automatically.
Permit user-defined combinations of scrypt-tmto and kernel-accel even if they may exceed available memory.
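For context, a minimal standalone sketch of the unified-memory query that the diff below performs through hashcat's hc_cuDeviceGetAttribute wrapper; the attribute names (CU_DEVICE_ATTRIBUTE_INTEGRATED here, hipDeviceAttributeIntegrated via hipDeviceGetAttribute on the HIP side) come from the diff, while the standalone driver-API program around them is only illustrative. On the command line, the user overrides mentioned above map to -n/--kernel-accel and --scrypt-tmto.

#include <stdio.h>
#include <cuda.h>

int main (void)
{
  if (cuInit (0) != CUDA_SUCCESS) return 1;

  int device_count = 0;

  cuDeviceGetCount (&device_count);

  for (int i = 0; i < device_count; i++)
  {
    CUdevice dev;

    cuDeviceGet (&dev, i);

    int integrated = 0; // 1 on APUs / iGPUs that share physical memory with the host

    cuDeviceGetAttribute (&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);

    printf ("device %d: unified (host-shared) memory: %s\n", i, integrated ? "yes" : "no");
  }

  return 0;
}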
Jens Steube 2025-06-17 13:32:57 +02:00
parent 1ac14903d0
commit 4b93a6e93c
6 changed files with 262 additions and 128 deletions

View File

@@ -3,7 +3,7 @@ for (int digest_pos = 0; digest_pos < DIGESTS_CNT; digest_pos++)
{
const u32 final_hash_pos = DIGESTS_OFFSET_HOST + digest_pos;
const digest_t *digest = digests_buf + final_hash_pos;
GLOBAL_AS const digest_t *digest = digests_buf + final_hash_pos;
const int invalid_bits = count_bits_32 (digest->digest_buf[0], r0)
+ count_bits_32 (digest->digest_buf[1], r1)

View File

@@ -294,10 +294,12 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
const u32 zSIZE = STATE_CNT4;
const u32 x = (u32) gid;
const u64 bid = get_group_id(0);
const u64 lsz = get_local_size(0);
const u64 lid = get_local_id(0);
const u32 xd4 = x / 4;
const u32 xm4 = x & 3;
const u32 xd4 = bid / 4;
const u32 xm4 = bid & 3;
GLOBAL_AS uint4 *V;
@@ -311,7 +313,7 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
for (u32 y = 0; y < ySIZE; y++)
{
for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
for (u32 z = 0; z < zSIZE; z++) V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)] = X[z];
#ifdef IS_HIP
for (u32 i = 0; i < (1 << SCRYPT_TMTO); i++) salsa_r_l ((LOCAL_AS u32 *) X);
@@ -330,10 +332,12 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
const u32 zSIZE = STATE_CNT4;
const u32 x = (u32) gid;
const u64 bid = get_group_id(0);
const u64 lsz = get_local_size(0);
const u64 lid = get_local_id(0);
const u32 xd4 = x / 4;
const u32 xm4 = x & 3;
const u32 xd4 = bid / 4;
const u32 xm4 = bid & 3;
GLOBAL_AS uint4 *V;
@@ -347,7 +351,7 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
// note: max 2048 iterations = forced -u 2048
const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
const u32 N_max = (SCRYPT_N < 2048) ? SCRYPT_N : 2048;
for (u32 N_pos = 0; N_pos < N_max; N_pos++)
{
@@ -357,7 +361,7 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
const u32 km = k - (y << SCRYPT_TMTO);
for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];
for (u32 z = 0; z < zSIZE; z++) T[z] = V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)];
#ifdef IS_HIP
for (u32 i = 0; i < km; i++) salsa_r_l ((LOCAL_AS u32 *) T);

View File

@@ -15,8 +15,7 @@
#define STATE_CNT GET_STATE_CNT (SCRYPT_R)
#define STATE_CNT4 (STATE_CNT / 4)
#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
#define CO Coord(xd4,y,z)
#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))
#if defined IS_CUDA
inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); }
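To make the new indexing concrete: each work-group slot (bid / 4) inside its sub-buffer owns a contiguous slab of lsz * ySIZE * zSIZE elements, and within that slab every lane (lid) gets its own ySIZE x zSIZE region, addressed row-major over (y, z). A toy, self-contained illustration with made-up sizes (the VIDX macro is copied from the diff; nothing else here is hashcat code):

#include <stdio.h>

#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))

int main (void)
{
  const unsigned lsz = 4, ySIZE = 2, zSIZE = 3; // toy sizes only

  // each lane prints a disjoint, contiguous run of indices inside the slab of work-group slot 0
  for (unsigned lid = 0; lid < lsz; lid++)
    for (unsigned y = 0; y < ySIZE; y++)
      for (unsigned z = 0; z < zSIZE; z++)
        printf ("lid=%u y=%u z=%u -> %u\n", lid, y, z, VIDX (0, lsz, lid, ySIZE, zSIZE, y, z));

  return 0;
}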

View File

@@ -104,6 +104,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
return atomicOr (p, val);
}
DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
{
return blockIdx.x;
}
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
{
return (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -280,6 +285,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
return atomicOr (p, val);
}
DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
{
return blockIdx.x;
}
DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
{
return (blockIdx.x * blockDim.x) + threadIdx.x;

View File

@@ -5305,6 +5305,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
hc_string_trim_trailing (device_name);
// unified memory
int device_host_unified_memory = 0;
if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_device) == -1)
{
device_param->skipped = true;
continue;
}
device_param->device_host_unified_memory = device_host_unified_memory;
// device_processors
int device_processors = 0;
@@ -5735,6 +5748,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
hc_string_trim_trailing (device_name);
// unified memory
int device_host_unified_memory = 0;
if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, hipDeviceAttributeIntegrated, hip_device) == -1)
{
device_param->skipped = true;
continue;
}
device_param->device_host_unified_memory = device_host_unified_memory;
// device_processors
int device_processors = 0;
@@ -9705,11 +9731,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
u32 _kernel_accel = 0;
tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
if (user_options->kernel_accel_chgd == true)
{
_kernel_accel = user_options->kernel_accel;
}
else
{
tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
if (user_options->kernel_accel_chgd == true) _kernel_accel = user_options->kernel_accel;
if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
}
const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes, device_id, _kernel_accel);
@@ -10405,7 +10436,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
// extra buffer
u64 size_extra_buffer = 4;
u64 size_extra_buffer1 = 4;
u64 size_extra_buffer2 = 4;
u64 size_extra_buffer3 = 4;
u64 size_extra_buffer4 = 4;
if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT)
{
@@ -10423,20 +10457,51 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
device_param->extra_buffer_size = extra_buffer_size;
// for the size we actually allocate we need to cheat a bit in order to make it more easy for plugin developer.
//
// we will divide this size by 4 to workaround opencl limitation.
// this collides with a theoretical scenario (like -n1 -T1) where there's only one workitem,
// because inside the kernel the target buffer is selected by workitem_id / 4.
// but the maximum size of the buffer would be only 1/4 of what is needed -> overflow.
//
// to workaround this we make sure that there's always a full buffer in each of the 4 allocated buffers available.
/**
* We use a "4-buffer" strategy for certain hash types (like scrypt)
* that require large scratch buffers per work-item.
*
* The kernel assigns each work-item to one of 4 sub-buffers using:
* buffer index = workitem_id % 4
*
* This means that each of the 4 sub-buffers must be large enough to hold
* all work-items that map to it. However, the total number of work-items
* is not always a multiple of 4. If we naively split the total buffer size
* evenly into 4 parts, the last chunk may be too small and cause buffer
* overflows for configurations where work-items spill into a partially sized chunk.
*
* Previous versions worked around this by over-allocating a full extra buffer,
* but this wasted gpu memory for large hashes like scrypt with high N.
*
* This improved logic computes the exact number of work-items assigned to
* each of the 4 chunks and sizes each chunk precisely:
*
* - The first 'leftover' chunks get one extra work-item to cover any remainder.
* - This guarantees each chunk is large enough for its assigned work-items.
*/
const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_accel_max;
const u64 extra_buffer_size_one = extra_buffer_size / kernel_power_max;
const u64 extra_buffer_size_threads = extra_buffer_size / kernel_power_max;
size_extra_buffer = extra_buffer_size + (extra_buffer_size_one * 4);
const u64 workitems_per_chunk = kernel_power_max / 4;
const u64 base_chunk_size = workitems_per_chunk * extra_buffer_size_threads;
size_extra_buffer1 = base_chunk_size;
size_extra_buffer2 = base_chunk_size;
size_extra_buffer3 = base_chunk_size;
size_extra_buffer4 = base_chunk_size;
const u64 leftover = kernel_power_max % 4;
switch (leftover)
{
case 3: size_extra_buffer3 += extra_buffer_size_threads; // fall-through
case 2: size_extra_buffer2 += extra_buffer_size_threads; // fall-through
case 1: size_extra_buffer1 += extra_buffer_size_threads; // fall-through
case 0: break;
}
}
// kern type
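To make the chunk sizing above concrete, a small self-contained example with hypothetical numbers (kernel_power_max = 10 work-items and 1 MiB of extra buffer per work-item; the variable names match the diff, the values do not come from it): the four buffers receive 3, 3, 2 and 2 work-items instead of a naive 10 / 4 split, so no buffer can end up smaller than the work-items mapped onto it.

#include <stdio.h>
#include <inttypes.h>

int main (void)
{
  const uint64_t kernel_power_max          = 10;          // hypothetical total work-items
  const uint64_t extra_buffer_size_threads = 1024 * 1024; // hypothetical bytes per work-item

  const uint64_t workitems_per_chunk = kernel_power_max / 4; // 2
  const uint64_t leftover            = kernel_power_max % 4; // 2

  for (uint64_t i = 0; i < 4; i++)
  {
    // the first 'leftover' buffers take one extra work-item, matching the switch fall-through above
    const uint64_t items = workitems_per_chunk + ((i < leftover) ? 1 : 0);
    const uint64_t bytes = items * extra_buffer_size_threads;

    printf ("extra buffer %" PRIu64 ": %" PRIu64 " work-items, %" PRIu64 " bytes\n", i + 1, items, bytes);
  }

  return 0;
}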
@@ -11343,7 +11408,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
+ size_shown
+ size_salts
+ size_results
+ size_extra_buffer
+ size_extra_buffer1
+ size_extra_buffer2
+ size_extra_buffer3
+ size_extra_buffer4
+ size_st_digests
+ size_st_salts
+ size_st_esalts
@@ -11380,10 +11448,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_shown, size_shown) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_salt_bufs, size_salts) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_result, size_results) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer1) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer2) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer3) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer4) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_digests_buf, size_st_digests) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf, size_st_salts) == -1) return -1;
if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11491,10 +11559,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_digests_shown, size_shown) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_salt_bufs, size_salts) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_result, size_results) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer1) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer2) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer3) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer4) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_digests_buf, size_st_digests) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_salts_buf, size_st_salts) == -1) return -1;
if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11614,10 +11682,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
// shared
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_results, NULL, &device_param->metal_d_result) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer1, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer2, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer3, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
// gpu only
if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_st_digests, NULL, &device_param->metal_d_st_digests_buf) == -1) return -1;
@@ -11712,10 +11780,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_shown, NULL, &device_param->opencl_d_digests_shown) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_salts, NULL, &device_param->opencl_d_salt_bufs) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_results, NULL, &device_param->opencl_d_result) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer1, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer2, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer3, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_digests, NULL, &device_param->opencl_d_st_digests_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_salts, NULL, &device_param->opencl_d_st_salts_buf) == -1) return -1;
if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_kernel_params, NULL, &device_param->opencl_d_kernel_param) == -1) return -1;
@@ -15734,7 +15802,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
if (size_rules > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_rules_c > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_salts > undocumented_single_allocation_apple) memory_limit_hit = 1;
if ((size_extra_buffer / 4) > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer1 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer2 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer3 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_extra_buffer4 > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_shown > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_tm > undocumented_single_allocation_apple) memory_limit_hit = 1;
if (size_tmps > undocumented_single_allocation_apple) memory_limit_hit = 1;
@@ -15769,7 +15840,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
+ size_rules
+ size_rules_c
+ size_salts
+ size_extra_buffer
+ size_extra_buffer1
+ size_extra_buffer2
+ size_extra_buffer3
+ size_extra_buffer4
+ size_shown
+ size_tm
+ size_tmps

View File

@@ -46,22 +46,17 @@ u32 scrypt_exptected_threads (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
return threads;
}
const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
// preprocess tmto in case user has overridden
// it's important to set to 0 otherwise so we can postprocess tmto in that case
tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
// we enforce the same configuration for all hashes, so the next lines should be fine
const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N == 0) ? hashes->st_salts_buf[0].scrypt_N : hashes->salts_buf[0].scrypt_N;
const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r == 0) ? hashes->st_salts_buf[0].scrypt_r : hashes->salts_buf[0].scrypt_r;
const u32 scrypt_p = (hashes->salts_buf[0].scrypt_p == 0) ? hashes->st_salts_buf[0].scrypt_p : hashes->salts_buf[0].scrypt_p;
const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param)) >> tmto;
const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
const u64 state_per_accel = (128ULL * scrypt_r * scrypt_p * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
int lines_sz = 4096;
@@ -72,95 +67,144 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
const u32 device_local_mem_size = device_param->device_local_mem_size;
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
const u64 fixed_mem = (512 * 1024 * 1024); // some storage we need for pws[], tmps[], and others
const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - fixed_mem;
tmto = 0;
u32 kernel_accel_new = device_processors;
if (kernel_accel)
if (kernel_accel_user)
{
// from command line or tuning db has priority
kernel_accel_new = kernel_accel_user;
kernel_accel_new = user_options->kernel_accel;
}
else
{
// find a nice kernel_accel for gpus programmatically
// on cpus there's no need for over subscription with scrypt
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
if (user_options->scrypt_tmto_chgd == true)
{
if ((size_per_accel * device_processors) > available_mem) // not enough memory
// in this branch the user can shoot themselves in the foot
tmto = user_options->scrypt_tmto;
}
else
{
// only option to save the user is to increase tmto
for (tmto = 0; tmto < 6; tmto++)
{
const float multi = (float) available_mem / size_per_accel;
const u64 size_per_accel_tmto = size_per_accel >> tmto;
if ((multi * 2) >= device_processors)
{
kernel_accel_new = multi * 2;
kernel_accel_new -= 2;
if ((size_per_accel_tmto * kernel_accel_new) > available_mem) continue; // not enough memory
if ((multi * 4) >= device_processors * 2)
{
kernel_accel_new = multi * 4;
kernel_accel_new -= 4;
}
}
else if ((multi * 4) >= device_processors)
{
kernel_accel_new = multi * 4;
kernel_accel_new -= 4;
}
// clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
if (kernel_accel_new > device_processors)
{
const u32 extra = kernel_accel_new % device_processors;
if (extra < (device_processors * 0.16))
{
kernel_accel_new -= extra;
}
}
}
else
{
u64 multi = available_mem / size_per_accel;
if (tmto == 0)
{
tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
multi = available_mem / (size_per_accel >> tmto);
}
multi /= device_processors;
multi -= 4;
multi = MIN (16, multi);
kernel_accel_new = device_processors * multi; // should be safe because of tmto
break;
}
}
}
// fix tmto if user allows
if (tmto == 0)
else
{
const u32 tmto_start = 0;
const u32 tmto_stop = 5;
for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
if (user_options->scrypt_tmto_chgd == true)
{
// global memory check
if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
tmto = user_options->scrypt_tmto;
}
else
{
// This is the typical case and the main challenge: choosing the right TMTO value.
// Finding a consistently good algorithm is nearly impossible due to the many factors
// that influence performance. There is no clear rule of thumb.
//
// For example, consider the default scrypt configuration with N=16k and r=8.
//
// In one test with an NVIDIA mobile GPU with 16 GiB of memory (minus X), the device could
// use 28/58 processors. In theory, increasing the TMTO should increase
// performance, but in practice it had no effect at all.
//
// In another test with an NVIDIA discrete GPU with 11 GiB (minus X), the device initially
// used 19/68 processors. Increasing the TMTO to utilize all 68 processors
// did yield the expected performance improvement, matching the theory.
//
// However, with an AMD discrete GPU with 24 GiB (minus X), the optimal case used 46/48
// processors. Increasing the TMTO should have reduced performance, but
// instead it nearly doubled the speed?! This might be related to AMD GPUs performing
// best with a thread count of 64 instead of 32, but in practice, using 64 threads
// shows little difference compared to 32, suggesting that at a very low level,
// only 32 threads may actually be active.
//
// This algorithm is far from ideal. Fortunately, we have a tuning database,
// so users can find the best -n value for their specific setup, and a forced -n value
// makes it easy to calculate the TMTO.
// also need local memory check because in kernel we have:
// LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; // 32 * 128 * r * p = 32KiB we're close if there's no TMTO
if (device_local_mem_size < (state_per_accel >> tmto_new)) continue;
if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
{
for (tmto = 0; tmto < 2; tmto++) // results in tmto = 2
{
if (device_param->device_host_unified_memory == 1) break; // do not touch
tmto = tmto_new;
if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
|| (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP))
{
if (tmto == 0) continue; // at least 1
}
break;
const u64 size_per_accel_tmto = size_per_accel >> tmto;
const float blocks = (float) available_mem / size_per_accel_tmto;
const float blocks_perc = device_processors / blocks;
if (blocks_perc > 1.16) continue;
// probably very low scrypt configuration = register pressure becomes a bottleneck
if ((blocks_perc * (1 << tmto)) < 0.4) continue;
break;
}
if (device_param->is_hip == true)
{
// we use some local memory to speed up things, so
// we need to make sure there's enough local memory available
u64 state_per_accel_tmto = state_per_accel >> tmto;
while (state_per_accel_tmto > device_local_mem_size)
{
tmto++;
state_per_accel_tmto = state_per_accel >> tmto;
}
}
}
}
// from here tmto is known, and we need to update kernel_accel
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false))
{
const u64 size_per_accel_tmto = size_per_accel >> tmto;
kernel_accel_new = available_mem / size_per_accel_tmto;
kernel_accel_new = MIN (kernel_accel_new, 1024); // max supported
// luxury option, clamp if we have twice the processors
if (kernel_accel_new > (device_processors * 2))
{
const u32 extra = kernel_accel_new % device_processors;
kernel_accel_new -= extra;
}
// clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
if (kernel_accel_new > device_processors)
{
const u32 extra = kernel_accel_new % device_processors;
if (extra < (device_processors * 0.16))
{
kernel_accel_new -= extra;
}
}
}
}
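A compact, self-contained sketch of how the branch above (GPU, no unified memory, no user-forced tmto) picks the TMTO and then derives kernel_accel, with hypothetical numbers (scrypt N=16k, r=8, 32 threads, 48 processors, roughly 7.5 GiB of usable memory); it reuses the variable names from the diff but leaves out the AMD "at least tmto 1" rule, the HIP local-memory adjustment, and the final clamping steps:

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  const uint64_t size_per_accel    = 128ULL * 8 * 16384 * 32; // 512 MiB: hypothetical N=16k, r=8, 32 threads
  const uint64_t available_mem     = 7680ULL * 1024 * 1024;   // hypothetical, after the fixed 512 MiB reservation
  const uint32_t device_processors = 48;                      // hypothetical

  uint32_t tmto;

  for (tmto = 0; tmto < 2; tmto++) // same bound as the diff: falls through to tmto = 2 if no fit is found
  {
    const uint64_t size_per_accel_tmto = size_per_accel >> tmto;

    const float blocks      = (float) available_mem / size_per_accel_tmto;
    const float blocks_perc = device_processors / blocks;

    if (blocks_perc > 1.16f) continue;                // not enough blocks fit yet: raise tmto
    if ((blocks_perc * (1 << tmto)) < 0.4f) continue; // very low scrypt config: register pressure dominates

    break;
  }

  const uint64_t kernel_accel = available_mem / (size_per_accel >> tmto);

  printf ("tmto=%u kernel_accel=%llu\n", tmto, (unsigned long long) kernel_accel); // here: tmto=2, kernel_accel=60

  return 0;
}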
@@ -189,9 +233,11 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
const u64 size_per_accel = 128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param);
u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
const u64 size_per_accel_tmto = size_per_accel >> tmto;
return size_scrypt / (1 << tmto);
const u64 size_scrypt = device_param->kernel_accel_max * size_per_accel_tmto;
return size_scrypt;
}
u64 scrypt_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
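Worked example of the size returned above, with the same hypothetical parameters as the earlier sketch (N=16384, r=8, 32 threads, tmto=2, kernel_accel_max=60); the formula is the one from the diff, the numbers are not:

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  const uint64_t scrypt_N = 16384, scrypt_r = 8, threads = 32; // hypothetical
  const uint64_t tmto = 2, kernel_accel_max = 60;              // hypothetical

  const uint64_t size_per_accel      = 128ULL * scrypt_r * scrypt_N * threads; // 512 MiB
  const uint64_t size_per_accel_tmto = size_per_accel >> tmto;                 // 128 MiB
  const uint64_t size_scrypt         = kernel_accel_max * size_per_accel_tmto; // 7.5 GiB

  printf ("%llu bytes (%.1f GiB)\n", (unsigned long long) size_scrypt, (double) size_scrypt / (1024.0 * 1024.0 * 1024.0));

  return 0;
}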
@@ -256,3 +302,4 @@ char *scrypt_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconf
return jit_build_options;
}