Removed the ALIGN_PTR_1k() macro from the SCRYPT kernels. Removing it has a major impact on NVIDIA GPU performance, though the reason is unclear: the buffers already appear to be page-aligned and the macro was only invoked outside the main loop, so its impact should have been minimal.

Funnel shift is now always enabled on NVIDIA GPUs, even on models previously treated as unsupported (sm < 70), because it still has a positive effect on performance.
2025-07-28 17:38:43 +00:00 · 2025-07-22 09:51:33 +02:00 · 2025-07-22 09:51:33 +02:00 · e90b6c8e53
commit e90b6c8e53
parent af5c824936
3 changed files with 12 additions and 11 deletions
--- a/OpenCL/inc_hash_scrypt.cl
+++ b/OpenCL/inc_hash_scrypt.cl
@ -162,10 +162,10 @@ DECLSPEC void scrypt_smix_init (GLOBAL_AS u32 *P, PRIVATE_AS u32 *X, GLOBAL_AS v

  switch (xm4)
  {
-    case 0: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V0); break;
-    case 1: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V1); break;
-    case 2: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V2); break;
-    case 3: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V3); break;
+    case 0: V = (GLOBAL_AS hc_uint4_t *) V0; break;
+    case 1: V = (GLOBAL_AS hc_uint4_t *) V1; break;
+    case 2: V = (GLOBAL_AS hc_uint4_t *) V2; break;
+    case 3: V = (GLOBAL_AS hc_uint4_t *) V3; break;
  }

  GLOBAL_AS hc_uint4_t *Vx = V + (xd4 * lsz * ySIZE * zSIZE) + (lid * ySIZE * zSIZE);
@ -206,10 +206,10 @@ DECLSPEC void scrypt_smix_loop (GLOBAL_AS u32 *P, PRIVATE_AS u32 *X, PRIVATE_AS

  switch (xm4)
  {
-    case 0: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V0); break;
-    case 1: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V1); break;
-    case 2: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V2); break;
-    case 3: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V3); break;
+    case 0: V = (GLOBAL_AS hc_uint4_t *) V0; break;
+    case 1: V = (GLOBAL_AS hc_uint4_t *) V1; break;
+    case 2: V = (GLOBAL_AS hc_uint4_t *) V2; break;
+    case 3: V = (GLOBAL_AS hc_uint4_t *) V3; break;
  }

  GLOBAL_AS hc_uint4_t *Vx = V + (xd4 * lsz * ySIZE * zSIZE) + (lid * ySIZE * zSIZE);
--- a/OpenCL/inc_hash_scrypt.h
+++ b/OpenCL/inc_hash_scrypt.h
@ -28,7 +28,8 @@
 // should be safe, because in backend.c we use:
 //    u64 size_extra_buffer1 = 4096;
 //  size_extra_buffer1 += base_chunk_size;
-#define ALIGN_PTR_1k(p) ((GLOBAL_AS hc_uint4_t *) (((u64) (p) + 1023) & ~1023UL))
+// could be useless, pointers seem to be page aligned
+//#define ALIGN_PTR_1k(p) ((GLOBAL_AS hc_uint4_t *) (((u64) (p) + 1023) & ~1023UL))

 #if defined IS_INTEL_SDK

--- a/src/backend.c
+++ b/src/backend.c
@ -5977,7 +5977,7 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virt
      device_param->has_lop3  = (sm >= 50) ? true : false;
      device_param->has_mov64 = (sm >= 10) ? true : false;
      device_param->has_prmt  = (sm >= 20) ? true : false;
-      device_param->has_shfw  = (sm >= 70) ? true : false;
+      device_param->has_shfw  = (sm >= 70) ? true : true; // still faster

      // one-time init cuda context

@ -8120,7 +8120,7 @@ static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, int *vi
          device_param->has_lop3  = (sm >= 50) ? true : false;
          device_param->has_mov64 = (sm >= 10) ? true : false;
          device_param->has_prmt  = (sm >= 20) ? true : false;
-          device_param->has_shfw  = (sm >= 70) ? true : false;
+          device_param->has_shfw  = (sm >= 70) ? true : true; // still faster
        }

        // common driver check