diff --git a/OpenCL/inc_comp_multi_iba.cl b/OpenCL/inc_comp_multi_iba.cl
index 9a41cc17c..05d8bf3b1 100644
--- a/OpenCL/inc_comp_multi_iba.cl
+++ b/OpenCL/inc_comp_multi_iba.cl
@@ -3,7 +3,7 @@ for (int digest_pos = 0; digest_pos < DIGESTS_CNT; digest_pos++)
 {
   const u32 final_hash_pos = DIGESTS_OFFSET_HOST + digest_pos;
 
-  const digest_t *digest = digests_buf + final_hash_pos;
+  GLOBAL_AS const digest_t *digest = digests_buf + final_hash_pos;
 
   const int invalid_bits = count_bits_32 (digest->digest_buf[0], r0)
                          + count_bits_32 (digest->digest_buf[1], r1)
diff --git a/OpenCL/inc_hash_scrypt.cl b/OpenCL/inc_hash_scrypt.cl
index 9662e330c..6655e2191 100644
--- a/OpenCL/inc_hash_scrypt.cl
+++ b/OpenCL/inc_hash_scrypt.cl
@@ -294,10 +294,12 @@ DECLSPEC void scrypt_smix_init (PRIVATE_AS uint4 *X, GLOBAL_AS uint4 *V0, GLOBAL
   const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
   const u32 zSIZE = STATE_CNT4;
 
-  const u32 x = (u32) gid;
+  const u64 bid = get_group_id(0);
+  const u64 lsz = get_local_size(0);
+  const u64 lid = get_local_id(0);
 
-  const u32 xd4 = x / 4;
-  const u32 xm4 = x & 3;
+  const u32 xd4 = bid / 4;
+  const u32 xm4 = bid & 3;
 
   GLOBAL_AS uint4 *V;
 
@@ -311,7 +313,7 @@
   for (u32 y = 0; y < ySIZE; y++)
   {
-    for (u32 z = 0; z < zSIZE; z++) V[CO] = X[z];
+    for (u32 z = 0; z < zSIZE; z++) V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)] = X[z];
 
     #ifdef IS_HIP
     for (u32 i = 0; i < (1 << SCRYPT_TMTO); i++) salsa_r_l ((LOCAL_AS u32 *) X);
@@ -330,10 +332,12 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
   const u32 ySIZE = SCRYPT_N >> SCRYPT_TMTO;
   const u32 zSIZE = STATE_CNT4;
 
-  const u32 x = (u32) gid;
+  const u64 bid = get_group_id(0);
+  const u64 lsz = get_local_size(0);
+  const u64 lid = get_local_id(0);
 
-  const u32 xd4 = x / 4;
-  const u32 xm4 = x & 3;
+  const u32 xd4 = bid / 4;
+  const u32 xm4 = bid & 3;
 
   GLOBAL_AS uint4 *V;
 
@@ -347,7 +351,7 @@
 
   // note: max 2048 iterations = forced -u 2048
 
-  const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
+  const u32 N_max = (SCRYPT_N < 2048) ? SCRYPT_N : 2048;
 
   for (u32 N_pos = 0; N_pos < N_max; N_pos++)
   {
@@ -357,7 +361,7 @@
     const u32 km = k - (y << SCRYPT_TMTO);
 
-    for (u32 z = 0; z < zSIZE; z++) T[z] = V[CO];
+    for (u32 z = 0; z < zSIZE; z++) T[z] = V[VIDX(xd4, lsz, lid, ySIZE, zSIZE, y, z)];
 
     #ifdef IS_HIP
     for (u32 i = 0; i < km; i++) salsa_r_l ((LOCAL_AS u32 *) T);
diff --git a/OpenCL/inc_hash_scrypt.h b/OpenCL/inc_hash_scrypt.h
index a41b09def..6f3864cdd 100644
--- a/OpenCL/inc_hash_scrypt.h
+++ b/OpenCL/inc_hash_scrypt.h
@@ -15,8 +15,7 @@
 #define STATE_CNT GET_STATE_CNT (SCRYPT_R)
 #define STATE_CNT4 (STATE_CNT / 4)
 
-#define Coord(xd4,y,z) (((xd4) * ySIZE * zSIZE) + ((y) * zSIZE) + (z))
-#define CO Coord(xd4,y,z)
+#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))
 
 #if defined IS_CUDA
 inline __device__ uint4 operator & (const uint4 a, const u32 b) { return make_uint4 ((a.x & b ), (a.y & b ), (a.z & b ), (a.w & b )); }
diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl
index 99d2cea13..1125236dd 100644
--- a/OpenCL/inc_platform.cl
+++ b/OpenCL/inc_platform.cl
@@ -104,6 +104,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
   return atomicOr (p, val);
 }
 
+DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
+{
+  return blockIdx.x;
+}
+
 DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
 {
   return (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -280,6 +285,11 @@ DECLSPEC u32 hc_atomic_or (GLOBAL_AS u32 *p, volatile const u32 val)
   return atomicOr (p, val);
 }
 
+DECLSPEC size_t get_group_id (const u32 dimindx __attribute__((unused)))
+{
+  return blockIdx.x;
+}
+
 DECLSPEC size_t get_global_id (const u32 dimindx __attribute__((unused)))
 {
   return (blockIdx.x * blockDim.x) + threadIdx.x;
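Note: the old Coord/CO macro derived a work-item's slice of the scratch buffer V from its global id, which no longer works once the work-group id (rather than the global id) selects the buffer quarter. VIDX keys the slice off (group id / 4, lane id) instead, so every lane of a work-group owns a contiguous ySIZE * zSIZE region inside the group's slice. A minimal host-side C sketch of the index arithmetic; the sizes (ySIZE = 8, zSIZE = 4, lsz = 32, group 5) are made-up toy values, not hashcat's real configuration:

#include <stdio.h>

/* same formula as the new VIDX macro in inc_hash_scrypt.h */
#define VIDX(bid4,lsz,lid,ySIZE,zSIZE,y,z) (((bid4) * (lsz) * (ySIZE) * (zSIZE)) + ((lid) * (ySIZE) * (zSIZE)) + ((y) * (zSIZE)) + (z))

int main (void)
{
  const unsigned ySIZE = 8;  // toy stand-in for SCRYPT_N >> SCRYPT_TMTO
  const unsigned zSIZE = 4;  // toy stand-in for STATE_CNT4
  const unsigned lsz   = 32; // work-group size
  const unsigned bid   = 5;  // some work-group id
  const unsigned xd4   = bid / 4;

  // each lane owns a contiguous ySIZE * zSIZE region within the group's slice
  for (unsigned lid = 0; lid < 4; lid++)
  {
    printf ("lid=%u -> first slot %u, last slot %u\n",
      lid,
      VIDX (xd4, lsz, lid, ySIZE, zSIZE, 0, 0),
      VIDX (xd4, lsz, lid, ySIZE, zSIZE, ySIZE - 1, zSIZE - 1));
  }

  return 0;
}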
diff --git a/src/backend.c b/src/backend.c
index 4a7e36be3..9b1f290a5 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -5305,6 +5305,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       hc_string_trim_trailing (device_name);
 
+      // unified memory
+
+      int device_host_unified_memory = 0;
+
+      if (hc_cuDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      device_param->device_host_unified_memory = device_host_unified_memory;
+
       // device_processors
 
       int device_processors = 0;
@@ -5735,6 +5748,19 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
 
       hc_string_trim_trailing (device_name);
 
+      // unified memory
+
+      int device_host_unified_memory = 0;
+
+      if (hc_hipDeviceGetAttribute (hashcat_ctx, &device_host_unified_memory, hipDeviceAttributeIntegrated, hip_device) == -1)
+      {
+        device_param->skipped = true;
+
+        continue;
+      }
+
+      device_param->device_host_unified_memory = device_host_unified_memory;
+
       // device_processors
 
       int device_processors = 0;
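Note: hc_cuDeviceGetAttribute / hc_hipDeviceGetAttribute are hashcat's error-checked wrappers; the underlying query is a one-liner in the plain CUDA driver API (HIP mirrors it with hipDeviceGetAttribute / hipDeviceAttributeIntegrated). A minimal standalone sketch of the attribute this hunk records, querying device 0 without any hashcat context:

#include <stdio.h>
#include <cuda.h>

int main (void)
{
  if (cuInit (0) != CUDA_SUCCESS) return 1;

  CUdevice dev;

  if (cuDeviceGet (&dev, 0) != CUDA_SUCCESS) return 1;

  int integrated = 0;

  // 1 on integrated (host-unified-memory) devices, 0 on discrete boards
  if (cuDeviceGetAttribute (&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev) != CUDA_SUCCESS) return 1;

  printf ("host unified memory: %d\n", integrated);

  return 0;
}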
@@ -9705,11 +9731,16 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
       u32 _kernel_accel = 0;
 
-      tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
+      if (user_options->kernel_accel_chgd == true)
+      {
+        _kernel_accel = user_options->kernel_accel;
+      }
+      else
+      {
+        tuning_db_entry_t *tuningdb_entry = tuning_db_search (hashcat_ctx, device_param->device_name, device_param->opencl_device_type, user_options->attack_mode, hashconfig->hash_mode);
 
-      if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
-
-      if (user_options->kernel_accel_chgd == true) _kernel_accel = user_options->kernel_accel;
+        if (tuningdb_entry != NULL) _kernel_accel = tuningdb_entry->kernel_accel;
+      }
 
       const char *extra_tuningdb_block = module_ctx->module_extra_tuningdb_block (hashconfig, user_options, user_options_extra, backend_ctx, hashes, device_id, _kernel_accel);
 
@@ -10405,7 +10436,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
     // extra buffer
 
-    u64 size_extra_buffer = 4;
+    u64 size_extra_buffer1 = 4;
+    u64 size_extra_buffer2 = 4;
+    u64 size_extra_buffer3 = 4;
+    u64 size_extra_buffer4 = 4;
 
     if (module_ctx->module_extra_buffer_size != MODULE_DEFAULT)
    {
@@ -10423,20 +10457,51 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
 
      device_param->extra_buffer_size = extra_buffer_size;
 
-      // for the size we actually allocate we need to cheat a bit in order to make it more easy for plugin developer.
-      //
-      // we will divide this size by 4 to workaround opencl limitation.
-      // this collides with a theoretical scenario (like -n1 -T1) where there's only one workitem,
-      // because inside the kernel the target buffer is selected by workitem_id / 4.
-      // but the maximum size of the buffer would be only 1/4 of what is needed -> overflow.
-      //
-      // to workaround this we make sure that there's always a full buffer in each of the 4 allocated buffers available.
+      /**
+       * We use a "4-buffer" strategy for certain hash types (like scrypt)
+       * that require large scratch buffers per work-item.
+       *
+       * The kernel assigns each work-item to one of 4 sub-buffers using:
+       *   buffer index = workitem_id % 4
+       *
+       * This means that each of the 4 sub-buffers must be large enough to hold
+       * all work-items that map to it. However, the total number of work-items
+       * is not always a multiple of 4. If we naively split the total buffer size
+       * evenly into 4 parts, the last chunk may be too small and cause buffer
+       * overflows for configurations where work-items spill into a partially sized chunk.
+       *
+       * Previous versions worked around this by over-allocating a full extra buffer,
+       * but this wasted GPU memory for large hashes like scrypt with high N.
+       *
+       * This improved logic computes the exact number of work-items assigned to
+       * each of the 4 chunks and sizes each chunk precisely:
+       *
+       * - The first 'leftover' chunks get one extra work-item to cover any remainder.
+       * - This guarantees each chunk is large enough for its assigned work-items.
+       */
 
-      const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads_max * device_param->kernel_accel_max;
+      const u64 kernel_power_max = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_accel_max;
 
-      const u64 extra_buffer_size_one = extra_buffer_size / kernel_power_max;
+      const u64 extra_buffer_size_threads = extra_buffer_size / kernel_power_max;
 
-      size_extra_buffer = extra_buffer_size + (extra_buffer_size_one * 4);
+      const u64 workitems_per_chunk = kernel_power_max / 4;
+
+      const u64 base_chunk_size = workitems_per_chunk * extra_buffer_size_threads;
+
+      size_extra_buffer1 = base_chunk_size;
+      size_extra_buffer2 = base_chunk_size;
+      size_extra_buffer3 = base_chunk_size;
+      size_extra_buffer4 = base_chunk_size;
+
+      const u64 leftover = kernel_power_max % 4;
+
+      switch (leftover)
+      {
+        case 3: size_extra_buffer3 += extra_buffer_size_threads; // fall-through
+        case 2: size_extra_buffer2 += extra_buffer_size_threads; // fall-through
+        case 1: size_extra_buffer1 += extra_buffer_size_threads; // fall-through
+        case 0: break;
+      }
    }
 
    // kern type
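Note: the sizing arithmetic in the comment above is easy to verify with numbers. A small host-side sketch, using a hypothetical kernel_power_max of 127 (deliberately not a multiple of 4) and a made-up per-work-item size:

#include <assert.h>
#include <stdio.h>

int main (void)
{
  const unsigned long long per_workitem     = 1024; // extra_buffer_size / kernel_power_max
  const unsigned long long kernel_power_max = 127;  // hypothetical

  const unsigned long long per_chunk = kernel_power_max / 4; // 31
  const unsigned long long leftover  = kernel_power_max % 4; // 3

  unsigned long long chunk[4];

  for (int i = 0; i < 4; i++) chunk[i] = per_chunk * per_workitem;

  // the first 'leftover' chunks each take one extra work-item,
  // matching the residues 0..leftover-1 of workitem_id % 4
  for (unsigned long long i = 0; i < leftover; i++) chunk[i] += per_workitem;

  unsigned long long total = 0;

  for (int i = 0; i < 4; i++) total += chunk[i];

  // together the four chunks hold exactly kernel_power_max work-items
  assert (total == kernel_power_max * per_workitem);

  printf ("chunks: %llu %llu %llu %llu\n", chunk[0], chunk[1], chunk[2], chunk[3]);

  return 0;
}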
@@ -11343,7 +11408,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
                + size_shown
                + size_salts
                + size_results
-               + size_extra_buffer
+               + size_extra_buffer1
+               + size_extra_buffer2
+               + size_extra_buffer3
+               + size_extra_buffer4
                + size_st_digests
                + size_st_salts
                + size_st_esalts
@@ -11380,10 +11448,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_digests_shown, size_shown) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_salt_bufs, size_salts) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_result, size_results) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra0_buf, size_extra_buffer1) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra1_buf, size_extra_buffer2) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra2_buf, size_extra_buffer3) == -1) return -1;
+    if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_extra3_buf, size_extra_buffer4) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_digests_buf, size_st_digests) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_st_salts_buf, size_st_salts) == -1) return -1;
     if (hc_cuMemAlloc (hashcat_ctx, &device_param->cuda_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11491,10 +11559,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_digests_shown, size_shown) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_salt_bufs, size_salts) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_result, size_results) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer / 4) == -1) return -1;
-    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer / 4) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra0_buf, size_extra_buffer1) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra1_buf, size_extra_buffer2) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra2_buf, size_extra_buffer3) == -1) return -1;
+    if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_extra3_buf, size_extra_buffer4) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_digests_buf, size_st_digests) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_st_salts_buf, size_st_salts) == -1) return -1;
     if (hc_hipMemAlloc (hashcat_ctx, &device_param->hip_d_kernel_param, size_kernel_params) == -1) return -1;
@@ -11614,10 +11682,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     // shared
 
     if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_results, NULL, &device_param->metal_d_result) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
-    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer / 4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer1, NULL, &device_param->metal_d_extra0_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer2, NULL, &device_param->metal_d_extra1_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer3, NULL, &device_param->metal_d_extra2_buf) == -1) return -1;
+    if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_extra_buffer4, NULL, &device_param->metal_d_extra3_buf) == -1) return -1;
 
     // gpu only
 
     if (hc_mtlCreateBuffer (hashcat_ctx, device_param->metal_device, size_st_digests, NULL, &device_param->metal_d_st_digests_buf) == -1) return -1;
@@ -11712,10 +11780,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_shown, NULL, &device_param->opencl_d_digests_shown) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_salts, NULL, &device_param->opencl_d_salt_bufs) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_results, NULL, &device_param->opencl_d_result) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
-    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer / 4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer1, NULL, &device_param->opencl_d_extra0_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer2, NULL, &device_param->opencl_d_extra1_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer3, NULL, &device_param->opencl_d_extra2_buf) == -1) return -1;
+    if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_WRITE, size_extra_buffer4, NULL, &device_param->opencl_d_extra3_buf) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_digests, NULL, &device_param->opencl_d_st_digests_buf) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_st_salts, NULL, &device_param->opencl_d_st_salts_buf) == -1) return -1;
     if (hc_clCreateBuffer (hashcat_ctx, device_param->opencl_context, CL_MEM_READ_ONLY, size_kernel_params, NULL, &device_param->opencl_d_kernel_param) == -1) return -1;
@@ -15734,7 +15802,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
     if (size_rules > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_rules_c > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_salts > undocumented_single_allocation_apple) memory_limit_hit = 1;
-    if ((size_extra_buffer / 4) > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer1 > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer2 > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer3 > undocumented_single_allocation_apple) memory_limit_hit = 1;
+    if (size_extra_buffer4 > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_shown > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_tm > undocumented_single_allocation_apple) memory_limit_hit = 1;
     if (size_tmps > undocumented_single_allocation_apple) memory_limit_hit = 1;
@@ -15769,7 +15840,10 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
                     + size_rules
                     + size_rules_c
                     + size_salts
-                    + size_extra_buffer
+                    + size_extra_buffer1
+                    + size_extra_buffer2
+                    + size_extra_buffer3
+                    + size_extra_buffer4
                     + size_shown
                     + size_tm
                     + size_tmps
diff --git a/src/modules/scrypt_common.c b/src/modules/scrypt_common.c
index 059f2fca8..971683ae5 100644
--- a/src/modules/scrypt_common.c
+++ b/src/modules/scrypt_common.c
@@ -46,22 +46,17 @@ u32 scrypt_exptected_threads (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE
   return threads;
 }
 
-const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel)
+const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, const backend_ctx_t *backend_ctx, MAYBE_UNUSED const hashes_t *hashes, const u32 device_id, const u32 kernel_accel_user)
 {
   hc_device_param_t *device_param = &backend_ctx->devices_param[device_id];
 
-  // preprocess tmto in case user has overridden
-  // it's important to set to 0 otherwise so we can postprocess tmto in that case
-
-  tmto = (user_options->scrypt_tmto_chgd == true) ? user_options->scrypt_tmto : 0;
-
   // we enforce the same configuration for all hashes, so the next lines should be fine
 
   const u32 scrypt_N = (hashes->salts_buf[0].scrypt_N == 0) ? hashes->st_salts_buf[0].scrypt_N : hashes->salts_buf[0].scrypt_N;
   const u32 scrypt_r = (hashes->salts_buf[0].scrypt_r == 0) ? hashes->st_salts_buf[0].scrypt_r : hashes->salts_buf[0].scrypt_r;
   const u32 scrypt_p = (hashes->salts_buf[0].scrypt_p == 0) ? hashes->st_salts_buf[0].scrypt_p : hashes->salts_buf[0].scrypt_p;
 
-  const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param)) >> tmto;
+  const u64 size_per_accel = (128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
 
   const u64 state_per_accel = (128ULL * scrypt_r * scrypt_p * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param));
 
   int lines_sz = 4096;
@@ -72,95 +67,144 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
 
   const u32 device_local_mem_size = device_param->device_local_mem_size;
 
-  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4));
+  const u64 fixed_mem = (512 * 1024 * 1024); // some storage we need for pws[], tmps[], and others
+
+  const u64 available_mem = MIN (device_param->device_available_mem, (device_param->device_maxmem_alloc * 4)) - fixed_mem;
+
+  tmto = 0;
 
   u32 kernel_accel_new = device_processors;
 
-  if (kernel_accel)
+  if (kernel_accel_user)
   {
-    // from command line or tuning db has priority
+    kernel_accel_new = kernel_accel_user;
 
-    kernel_accel_new = user_options->kernel_accel;
-  }
-  else
-  {
-    // find a nice kernel_accel for gpus programmatically
-    // on cpus there's no need for over subscription with scrypt
-
-    if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+    if (user_options->scrypt_tmto_chgd == true)
     {
-      if ((size_per_accel * device_processors) > available_mem) // not enough memory
+      // in this branch the user can shoot themselves in the foot
+
+      tmto = user_options->scrypt_tmto;
+    }
+    else
+    {
+      // only option to save the user is to increase tmto
+
+      for (tmto = 0; tmto < 6; tmto++)
       {
-        const float multi = (float) available_mem / size_per_accel;
+        const u64 size_per_accel_tmto = size_per_accel >> tmto;
 
-        if ((multi * 2) >= device_processors)
-        {
-          kernel_accel_new = multi * 2;
-          kernel_accel_new -= 2;
+        if ((size_per_accel_tmto * kernel_accel_new) > available_mem) continue; // not enough memory
 
-          if ((multi * 4) >= device_processors * 2)
-          {
-            kernel_accel_new = multi * 4;
-            kernel_accel_new -= 4;
-          }
-        }
-        else if ((multi * 4) >= device_processors)
-        {
-          kernel_accel_new = multi * 4;
-          kernel_accel_new -= 4;
-        }
-
-        // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
-
-        if (kernel_accel_new > device_processors)
-        {
-          const u32 extra = kernel_accel_new % device_processors;
-
-          if (extra < (device_processors * 0.16))
-          {
-            kernel_accel_new -= extra;
-          }
-        }
-      }
-      else
-      {
-        u64 multi = available_mem / size_per_accel;
-
-        if (tmto == 0)
-        {
-          tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
-
-          multi = available_mem / (size_per_accel >> tmto);
-        }
-
-        multi /= device_processors;
-        multi -= 4;
-        multi = MIN (16, multi);
-
-        kernel_accel_new = device_processors * multi; // should be safe because of tmto
+        break;
       }
     }
   }
-
-  // fix tmto if user allows
-
-  if (tmto == 0)
+  else
   {
-    const u32 tmto_start = 0;
-    const u32 tmto_stop = 5;
-
-    for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
+    if (user_options->scrypt_tmto_chgd == true)
     {
-      // global memory check
-      if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
+      tmto = user_options->scrypt_tmto;
+    }
+    else
+    {
+      // This is the typical case and the main challenge: choosing the right TMTO value.
+      // Finding a consistently good algorithm is nearly impossible due to the many factors
+      // that influence performance. There is no clear rule of thumb.
+      //
+      // For example, consider the default scrypt configuration with N=16k and r=8.
+      //
+      // In one test with an NVIDIA mobile GPU with 16 GiB of memory (minus X), the device could
+      // use 28/58 processors. In theory, increasing the TMTO should increase
+      // performance, but in practice it had no effect at all.
+      //
+      // In another test with an NVIDIA discrete GPU with 11 GiB (minus X), the device initially
+      // used 19/68 processors. Increasing the TMTO to utilize all 68 processors
+      // did yield the expected performance improvement, matching the theory.
+      //
+      // However, with an AMD discrete GPU with 24 GiB (minus X), the optimal case used 46/48
+      // processors. Increasing the TMTO should have reduced performance, but
+      // instead it nearly doubled the speed?! This might be related to AMD GPUs performing
+      // best with a thread count of 64 instead of 32, but in practice, using 64 threads
+      // shows little difference compared to 32, suggesting that at a very low level,
+      // only 32 threads may actually be active.
+      //
+      // This algorithm is far from ideal. Fortunately, we have a tuning database,
+      // so users can find the best -n value for their specific setup, and a forced -n value
+      // makes it easy to calculate the TMTO.
 
-      // also need local memory check because in kernel we have:
-      // LOCAL_VK uint4 T_s[MAX_THREADS_PER_BLOCK][STATE_CNT4]; // 32 * 128 * r * p = 32KiB we're close if there's no TMTO
-      if (device_local_mem_size < (state_per_accel >> tmto_new)) continue;
+      if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
+      {
+        for (tmto = 0; tmto < 2; tmto++) // results in tmto = 2
+        {
+          if (device_param->device_host_unified_memory == 1) break; // do not touch
+
+          if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD)
+           || (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP))
+          {
+            if (tmto == 0) continue; // at least 1
+          }
 
-      tmto = tmto_new;
+          const u64 size_per_accel_tmto = size_per_accel >> tmto;
+
+          const float blocks = (float) available_mem / size_per_accel_tmto;
+
+          const float blocks_perc = device_processors / blocks;
+
+          if (blocks_perc > 1.16) continue;
+
+          // probably very low scrypt configuration = register pressure becomes a bottleneck
+          if ((blocks_perc * (1 << tmto)) < 0.4) continue;
 
-      break;
+          break;
+        }
+
+        if (device_param->is_hip == true)
+        {
+          // we use some local memory to speed up things, so
+          // we need to make sure there's enough local memory available
+
+          u64 state_per_accel_tmto = state_per_accel >> tmto;
+
+          while (state_per_accel_tmto > device_local_mem_size)
+          {
+            tmto++;
+
+            state_per_accel_tmto = state_per_accel >> tmto;
+          }
+        }
+      }
+    }
+
+    // from here tmto is known, and we need to update kernel_accel
+
+    if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->device_host_unified_memory == false))
+    {
+      const u64 size_per_accel_tmto = size_per_accel >> tmto;
+
+      kernel_accel_new = available_mem / size_per_accel_tmto;
+
+      kernel_accel_new = MIN (kernel_accel_new, 1024); // max supported
+
+      // luxury option, clamp if we have twice the processors
+
+      if (kernel_accel_new > (device_processors * 2))
+      {
+        const u32 extra = kernel_accel_new % device_processors;
+
+        kernel_accel_new -= extra;
+      }
+
+      // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
+
+      if (kernel_accel_new > device_processors)
+      {
+        const u32 extra = kernel_accel_new % device_processors;
+
+        if (extra < (device_processors * 0.16))
+        {
+          kernel_accel_new -= extra;
+        }
+      }
+    }
   }
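Note: a worked instance of the blocks_perc heuristic above, under assumed numbers that roughly match the 11 GiB discrete-GPU anecdote in the comment (68 processors, scrypt N=16384, r=8, 32 threads per accel unit; all values are hypothetical):

#include <stdio.h>

int main (void)
{
  const unsigned long long available_mem     = 11ULL * 1024 * 1024 * 1024; // after the 512 MiB reserve
  const unsigned long long size_per_accel    = 128ULL * 8 * 16384 * 32;    // 128 * r * N * threads = 512 MiB
  const unsigned           device_processors = 68;

  unsigned tmto;

  for (tmto = 0; tmto < 2; tmto++)
  {
    const unsigned long long size_per_accel_tmto = size_per_accel >> tmto;

    const float blocks      = (float) available_mem / size_per_accel_tmto;
    const float blocks_perc = device_processors / blocks;

    printf ("tmto=%u blocks=%.1f blocks_perc=%.2f\n", tmto, blocks, blocks_perc);

    if (blocks_perc > 1.16f) continue;                // not enough blocks to feed all processors yet
    if ((blocks_perc * (1 << tmto)) < 0.4f) continue; // register-pressure guard

    break;
  }

  // tmto=0 gives 22 blocks for 68 processors (3.09 > 1.16), tmto=1 gives 44 (1.55 > 1.16),
  // so the loop runs out and leaves tmto = 2, as the "results in tmto = 2" comment predicts
  printf ("final tmto=%u\n", tmto);

  return 0;
}

With tmto = 2 the kernel_accel update above would then give available_mem / (size_per_accel >> 2) = 88, which survives both clamps (88 < 2 * 68, and 88 % 68 = 20 is not below 0.16 * 68).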
@@ -189,9 +233,11 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
 
   const u64 size_per_accel = 128ULL * scrypt_r * scrypt_N * scrypt_exptected_threads (hashconfig, user_options, user_options_extra, device_param);
 
-  u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
+  const u64 size_per_accel_tmto = size_per_accel >> tmto;
 
-  return size_scrypt / (1 << tmto);
+  const u64 size_scrypt = device_param->kernel_accel_max * size_per_accel_tmto;
+
+  return size_scrypt;
 }
 
 u64 scrypt_module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
@@ -256,3 +302,4 @@ char *scrypt_module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconf
 
   return jit_build_options;
 }
+
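Note: after this change scrypt_module_extra_buffer_size returns exactly kernel_accel_max * ((128 * r * N * threads) >> tmto). Continuing the hypothetical numbers from the previous sketch (N=16384, r=8, 32 threads, tmto=2, kernel_accel_max=88):

#include <stdio.h>

int main (void)
{
  const unsigned long long scrypt_N = 16384, scrypt_r = 8, threads = 32; // assumed values
  const unsigned long long tmto = 2, kernel_accel_max = 88;              // assumed values

  const unsigned long long size_per_accel      = 128ULL * scrypt_r * scrypt_N * threads; // 512 MiB
  const unsigned long long size_per_accel_tmto = size_per_accel >> tmto;                 // 128 MiB

  printf ("%llu bytes\n", kernel_accel_max * size_per_accel_tmto); // 11811160064, about 11 GiB

  return 0;
}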