mirror of
https://github.com/hashcat/hashcat.git
synced 2025-07-28 17:38:43 +00:00
Removed the ALIGN_PTR_1k() macro for SCRYPT. This has a major impact on NV GPU performance, though the reason is unclear: pages are already aligned and the macro is called outside the main loop, so its impact should be minimal.
Funnelshift is now always enabled on NV GPUs, even on models that do not officially support it, as it has a positive effect on performance.
This commit is contained in:
parent
af5c824936
commit
e90b6c8e53
@ -162,10 +162,10 @@ DECLSPEC void scrypt_smix_init (GLOBAL_AS u32 *P, PRIVATE_AS u32 *X, GLOBAL_AS v
|
||||
|
||||
switch (xm4)
|
||||
{
|
||||
case 0: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V0); break;
|
||||
case 1: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V1); break;
|
||||
case 2: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V2); break;
|
||||
case 3: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V3); break;
|
||||
case 0: V = (GLOBAL_AS hc_uint4_t *) V0; break;
|
||||
case 1: V = (GLOBAL_AS hc_uint4_t *) V1; break;
|
||||
case 2: V = (GLOBAL_AS hc_uint4_t *) V2; break;
|
||||
case 3: V = (GLOBAL_AS hc_uint4_t *) V3; break;
|
||||
}
|
||||
|
||||
GLOBAL_AS hc_uint4_t *Vx = V + (xd4 * lsz * ySIZE * zSIZE) + (lid * ySIZE * zSIZE);
|
||||
@ -206,10 +206,10 @@ DECLSPEC void scrypt_smix_loop (GLOBAL_AS u32 *P, PRIVATE_AS u32 *X, PRIVATE_AS
|
||||
|
||||
switch (xm4)
|
||||
{
|
||||
case 0: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V0); break;
|
||||
case 1: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V1); break;
|
||||
case 2: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V2); break;
|
||||
case 3: V = (GLOBAL_AS hc_uint4_t *) ALIGN_PTR_1k (V3); break;
|
||||
case 0: V = (GLOBAL_AS hc_uint4_t *) V0; break;
|
||||
case 1: V = (GLOBAL_AS hc_uint4_t *) V1; break;
|
||||
case 2: V = (GLOBAL_AS hc_uint4_t *) V2; break;
|
||||
case 3: V = (GLOBAL_AS hc_uint4_t *) V3; break;
|
||||
}
|
||||
|
||||
GLOBAL_AS hc_uint4_t *Vx = V + (xd4 * lsz * ySIZE * zSIZE) + (lid * ySIZE * zSIZE);
|
||||
|
@ -28,7 +28,8 @@
|
||||
// should be safe, because in backend.c we use:
|
||||
// u64 size_extra_buffer1 = 4096;
|
||||
// size_extra_buffer1 += base_chunk_size;
|
||||
#define ALIGN_PTR_1k(p) ((GLOBAL_AS hc_uint4_t *) (((u64) (p) + 1023) & ~1023UL))
|
||||
// could be useless, pointers seem to be page aligned
|
||||
//#define ALIGN_PTR_1k(p) ((GLOBAL_AS hc_uint4_t *) (((u64) (p) + 1023) & ~1023UL))
|
||||
|
||||
#if defined IS_INTEL_SDK
|
||||
|
||||
|
@ -5977,7 +5977,7 @@ static void backend_ctx_devices_init_cuda (hashcat_ctx_t *hashcat_ctx, int *virt
|
||||
device_param->has_lop3 = (sm >= 50) ? true : false;
|
||||
device_param->has_mov64 = (sm >= 10) ? true : false;
|
||||
device_param->has_prmt = (sm >= 20) ? true : false;
|
||||
device_param->has_shfw = (sm >= 70) ? true : false;
|
||||
device_param->has_shfw = (sm >= 70) ? true : true; // still faster
|
||||
|
||||
// one-time init cuda context
|
||||
|
||||
@ -8120,7 +8120,7 @@ static void backend_ctx_devices_init_opencl (hashcat_ctx_t *hashcat_ctx, int *vi
|
||||
device_param->has_lop3 = (sm >= 50) ? true : false;
|
||||
device_param->has_mov64 = (sm >= 10) ? true : false;
|
||||
device_param->has_prmt = (sm >= 20) ? true : false;
|
||||
device_param->has_shfw = (sm >= 70) ? true : false;
|
||||
device_param->has_shfw = (sm >= 70) ? true : true; // still faster
|
||||
}
|
||||
|
||||
// common driver check
|
||||
|
Loading…
Reference in New Issue
Block a user