diff --git a/OpenCL/inc_hash_scrypt.cl b/OpenCL/inc_hash_scrypt.cl
index 6c59404a0..9662e330c 100644
--- a/OpenCL/inc_hash_scrypt.cl
+++ b/OpenCL/inc_hash_scrypt.cl
@@ -345,9 +345,11 @@ DECLSPEC void scrypt_smix_loop (PRIVATE_AS uint4 *X, PRIVATE_AS uint4 *T, GLOBAL
     case 3: V = V3; break;
   }
 
-  // note: fixed 1024 iterations = forced -u 1024
+  // note: max 2048 iterations = forced -u 2048
 
-  for (u32 N_pos = 0; N_pos < 1024; N_pos++)
+  const u32 N_max = (2048 > ySIZE) ? ySIZE : 2048;
+
+  for (u32 N_pos = 0; N_pos < N_max; N_pos++)
   {
     const u32 k = X[zSIZE - 4].x & (SCRYPT_N - 1);
 
diff --git a/OpenCL/inc_hash_scrypt.h b/OpenCL/inc_hash_scrypt.h
index 3ae019a6e..a41b09def 100644
--- a/OpenCL/inc_hash_scrypt.h
+++ b/OpenCL/inc_hash_scrypt.h
@@ -26,8 +26,20 @@ inline __device__ uint4 operator + (const uint4 a, const uint4 b) { return mak
 inline __device__ uint4 operator ^ (const uint4 a, const uint4 b) { return make_uint4 ((a.x ^ b.x), (a.y ^ b.y), (a.z ^ b.z), (a.w ^ b.w)); }
 inline __device__ uint4 operator | (const uint4 a, const uint4 b) { return make_uint4 ((a.x | b.x), (a.y | b.y), (a.z | b.z), (a.w | b.w)); }
 inline __device__ void operator ^= ( uint4 &a, const uint4 b) { a.x ^= b.x; a.y ^= b.y; a.z ^= b.z; a.w ^= b.w; }
+#endif
 
-inline __device__ uint4 rotate (const uint4 a, const int n) { return ((a << n) | ((a >> (32 - n)))); }
+#if defined IS_CUDA || defined IS_HIP
+inline __device__ uint4 rotate (const uint4 a, const int n)
+{
+  uint4 r;
+
+  r.x = hc_rotl32_S (a.x, n);
+  r.y = hc_rotl32_S (a.y, n);
+  r.z = hc_rotl32_S (a.z, n);
+  r.w = hc_rotl32_S (a.w, n);
+
+  return r;
+}
 #endif
 
 #endif
diff --git a/src/modules/scrypt_common.c b/src/modules/scrypt_common.c
index baf811d46..059f2fca8 100644
--- a/src/modules/scrypt_common.c
+++ b/src/modules/scrypt_common.c
@@ -10,14 +10,14 @@
 u32 scrypt_module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_min = 1024;
+  const u32 kernel_loops_min = 2048;
 
   return kernel_loops_min;
 }
 
 u32 scrypt_module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_loops_max = 1024;
+  const u32 kernel_loops_max = 2048;
 
   return kernel_loops_max;
 }
 
@@ -84,7 +84,8 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
   }
   else
   {
-    // find a nice kernel_accel programmatically
+    // find a nice kernel_accel for gpus programmatically
+    // on cpus there's no need for over subscription with scrypt
 
     if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU)
     {
@@ -92,45 +93,51 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
       {
         const float multi = (float) available_mem / size_per_accel;
 
-        int accel_multi;
-
-        for (accel_multi = 1; accel_multi <= 2; accel_multi++)
+        if ((multi * 2) >= device_processors)
         {
-          kernel_accel_new = multi * (1 << accel_multi);
+          kernel_accel_new = multi * 2;
+          kernel_accel_new -= 2;
 
-          if (kernel_accel_new >= device_processors) break;
+          if ((multi * 4) >= device_processors * 2)
+          {
+            kernel_accel_new = multi * 4;
+            kernel_accel_new -= 4;
+          }
+        }
+        else if ((multi * 4) >= device_processors)
+        {
+          kernel_accel_new = multi * 4;
+          kernel_accel_new -= 4;
         }
-
-        // we need some space for tmps[], ...
-
-        kernel_accel_new -= (1 << accel_multi);
 
         // clamp if close to device processors -- 16% seems fine on a 2080ti, and on a 4090
 
-        if ((kernel_accel_new > device_processors) && ((device_processors * 1.16) > kernel_accel_new))
+        if (kernel_accel_new > device_processors)
         {
-          kernel_accel_new = device_processors;
+          const u32 extra = kernel_accel_new % device_processors;
+
+          if (extra < (device_processors * 0.16))
+          {
+            kernel_accel_new -= extra;
+          }
         }
       }
       else
      {
-        for (int i = 1; i <= 8; i++)
+        u64 multi = available_mem / size_per_accel;
+
+        if (tmto == 0)
         {
-          if ((size_per_accel * device_processors * i) < available_mem)
-          {
-            kernel_accel_new = device_processors * i;
-          }
-        }
-      }
-    }
-    else
-    {
-      for (int i = 1; i <= 8; i++)
-      {
-        if ((size_per_accel * device_processors * i) < available_mem)
-        {
-          kernel_accel_new = device_processors * i;
+          tmto = 2; // we radically assign tmto = 2, since most gpus seem to enjoy that tmto
+
+          multi = available_mem / (size_per_accel >> tmto);
         }
+
+        multi /= device_processors;
+        multi -= 4;
+        multi = MIN (16, multi);
+
+        kernel_accel_new = device_processors * multi; // should be safe because of tmto
       }
     }
   }
@@ -144,9 +151,6 @@ const char *scrypt_module_extra_tuningdb_block (MAYBE_UNUSED const hashconfig_t
 
   for (u32 tmto_new = tmto_start; tmto_new <= tmto_stop; tmto_new++)
   {
-    // we have 1024 hard-coded in the kernel
-    if ((scrypt_N / (1 << tmto_new)) < 1024) continue;
-
     // global memory check
     if (available_mem < (kernel_accel_new * (size_per_accel >> tmto_new))) continue;
 
@@ -187,14 +191,6 @@ u64 scrypt_module_extra_buffer_size (MAYBE_UNUSED const hashconfig_t *hashconfig
 
   u64 size_scrypt = size_per_accel * device_param->kernel_accel_max;
 
-  // We must maintain at least 1024 iteration it's hard-coded in the kernel
-  if ((scrypt_N / (1 << tmto)) < 1024)
-  {
-    fprintf (stderr, "ERROR: SCRYPT-N parameter too low. Invalid tmto specified?\n");
-
-    return -1;
-  }
-
   return size_scrypt / (1 << tmto);
 }
 
diff --git a/tools/test_modules/m15700.pm b/tools/test_modules/m15700.pm
index 896ce66e5..abee738bf 100644
--- a/tools/test_modules/m15700.pm
+++ b/tools/test_modules/m15700.pm
@@ -17,8 +17,8 @@ sub module_generate_hash
 {
   my $word = shift;
   my $salt = shift;
-  my $scrypt_N = shift || 1024;   # 262144 originally
-  my $scrypt_r = shift || 1;      # 8 originally
+  my $scrypt_N = shift || 262144;
+  my $scrypt_r = shift || 8;
   my $scrypt_p = shift || 1;
 
   my $ciphertext = shift || random_bytes (32);
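Note on the new autotune path: since the kernel now iterates min(2048, ySIZE) instead of a hard-coded 1024, the `(scrypt_N / (1 << tmto)) < 1024` floor checks could be dropped. The sketch below re-implements the GPU kernel_accel selection from the scrypt_common.c hunk as a standalone C program so the arithmetic can be checked in isolation. It is a minimal sketch, not hashcat code: the device numbers (available_mem, size_per_accel, device_processors) are hypothetical placeholders, not values taken from the patch or from any real device.

```c
#include <stdio.h>
#include <stdint.h>

int main (void)
{
  // hypothetical device: 8 GiB usable memory, 168 MiB of scrypt
  // buffer per accel unit, 68 compute units (roughly 2080 Ti class)
  const uint64_t available_mem     = 8ULL * 1024 * 1024 * 1024;
  const uint64_t size_per_accel    = 168ULL * 1024 * 1024;
  const uint32_t device_processors = 68;

  uint32_t kernel_accel_new = device_processors;

  const float multi = (float) available_mem / size_per_accel;

  // same branch structure as the patch: take 2x or 4x the memory
  // multiplier when it covers the processor count, minus a small
  // safety margin for tmps[] and friends
  if ((multi * 2) >= device_processors)
  {
    kernel_accel_new  = multi * 2;
    kernel_accel_new -= 2;

    if ((multi * 4) >= device_processors * 2)
    {
      kernel_accel_new  = multi * 4;
      kernel_accel_new -= 4;
    }
  }
  else if ((multi * 4) >= device_processors)
  {
    kernel_accel_new  = multi * 4;
    kernel_accel_new -= 4;
  }

  // round down to a multiple of device_processors if the remainder
  // is within 16%, mirroring the clamp in the patch
  if (kernel_accel_new > device_processors)
  {
    const uint32_t extra = kernel_accel_new % device_processors;

    if (extra < (device_processors * 0.16)) kernel_accel_new -= extra;
  }

  // prints: multi = 48.76, kernel_accel_new = 191
  printf ("multi = %.2f, kernel_accel_new = %u\n", multi, kernel_accel_new);

  return 0;
}
```

The CPU branch follows the same memory budget idea: with the radical tmto = 2, each accel unit only needs a quarter of the V[] buffer (size_per_accel >> 2), so the multiplier roughly quadruples before being divided by device_processors, reduced by 4, and capped at 16.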