From e90b6c8e537dd2cda192144a6b3b25eb9ed38303 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 22 Jul 2025 09:51:33 +0200 Subject: [PATCH] Removed ALIGN_PTR_1k() macro for SCRYPT. This has a major impact on NV GPU performance, though the reason is unclear: pages are already aligned, and the macro is called outside the main loop, so its cost should have been minimal. Always enabled funnelshift on NV GPUs, even on unsupported models, as it has a positive effect on performance. --- OpenCL/inc_hash_scrypt.cl | 16 ++++++++-------- OpenCL/inc_hash_scrypt.h | 3 ++- src/backend.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/OpenCL/inc_hash_scrypt.cl b/OpenCL/inc_hash_scrypt.cl index 91bd37a2a..a24523a82 100644 --- a/OpenCL/inc_hash_scrypt.cl +++ b/OpenCL/inc_hash_scrypt.cl @@ -162,10 +162,10 @@ DECLSPEC void scrypt_smix_init (GLOBAL_AS u32 *P, PRIVATE_AS u32 *X, GLOBAL_AS v switch (xm4) { - case 0: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V0); break; - case 1: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V1); break; - case 2: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V2); break; - case 3: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V3); break; + case 0: V = (GLOBAL_AS hc_uint4_t *) V0; break; + case 1: V = (GLOBAL_AS hc_uint4_t *) V1; break; + case 2: V = (GLOBAL_AS hc_uint4_t *) V2; break; + case 3: V = (GLOBAL_AS hc_uint4_t *) V3; break; } GLOBAL_AS hc_uint4_t *Vx = V + (xd4 * lsz * ySIZE * zSIZE) + (lid * ySIZE * zSIZE); @@ -206,10 +206,10 @@ DECLSPEC void scrypt_smix_loop (GLOBAL_AS u32 *P, PRIVATE_AS u32 *X, PRIVATE_AS switch (xm4) { - case 0: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V0); break; - case 1: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V1); break; - case 2: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V2); break; - case 3: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V3); break; + case 0: V = (GLOBAL_AS hc_uint4_t *) V0; break; + case 1: V = (GLOBAL_AS hc_uint4_t *) V1; break; + case 2: V = (GLOBAL_AS hc_uint4_t *) V2; break; + 
case 3: V = (GLOBAL_AS hc_uint4_t *) V3; break; } GLOBAL_AS hc_uint4_t *Vx = V + (xd4 * lsz * ySIZE * zSIZE) + (lid * ySIZE * zSIZE); diff --git a/OpenCL/inc_hash_scrypt.h b/OpenCL/inc_hash_scrypt.h index d14b2843b..2d19ff7ef 100644 --- a/OpenCL/inc_hash_scrypt.h +++ b/OpenCL/inc_hash_scrypt.h @@ -28,7 +28,8 @@ // should be safe, because in backend.c we use: // u64 size_extra_buffer1 = 4096; // size_extra_buffer1 += base_chunk_size; -#define ALIGN_PTR_1k(p) ((GLOBAL_AS hc_uint4_t *) (((u64) (p) + 1023) & ~1023UL)) +// could be useless, pointers seem to be page aligned +//#define ALIGN_PTR_1k(p) ((GLOBAL_AS hc_uint4_t *) (((u64) (p) + 1023) & ~1023UL)) #if defined IS_INTEL_SDK diff --git a/src/backend.c b/src/backend.c index 1d26df77c..50da5b7dd 100644 --- a/src/backend.c +++ b/src/backend.c @@ -5977,7 +5977,7 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virt device_param->has_lop3 = (sm >= 50) ? true : false; device_param->has_mov64 = (sm >= 10) ? true : false; device_param->has_prmt = (sm >= 20) ? true : false; - device_param->has_shfw = (sm >= 70) ? true : false; + device_param->has_shfw = (sm >= 70) ? true : true; // still faster // one-time init cuda context @@ -8120,7 +8120,7 @@ static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, int *vi device_param->has_lop3 = (sm >= 50) ? true : false; device_param->has_mov64 = (sm >= 10) ? true : false; device_param->has_prmt = (sm >= 20) ? true : false; - device_param->has_shfw = (sm >= 70) ? true : false; + device_param->has_shfw = (sm >= 70) ? true : true; // still faster } // common driver check