From 674ca7d88f919f8305af59dbfd918a436c0c9889 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 12 Jul 2021 11:27:05 +0200 Subject: [PATCH] Add GPU threads to kernel cache checksum because it has an influence on HIP offline compile options Add V_ALIGNBIT_B32 inline assembly wrapper because HIP does not provide amd_bitalign() --- OpenCL/inc_platform.cl | 39 ++++++++++++++++++++++++++++++++++++--- src/backend.c | 13 ++++++++----- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl index 1a63e7c3a..806a403e4 100644 --- a/OpenCL/inc_platform.cl +++ b/OpenCL/inc_platform.cl @@ -246,22 +246,55 @@ DECLSPEC u32 rotr32_S (const u32 a, const int n) DECLSPEC u64x rotl64 (const u64x a, const int n) { - return ((a << n) | ((a >> (64 - n)))); + return rotr64 (a, 64 - n); +} + +DECLSPEC u32 amd_bitalign_S (const u32 a, const u32 b, const int n) +{ + u32 r = 0; + + __asm__ ("V_ALIGNBIT_B32 %0, %1, %2, %3;" : "=v"(r): "v"(a), "v"(b), "v"(n)); + + return r; } DECLSPEC u64x rotr64 (const u64x a, const int n) { + #if VECT_SIZE == 1 + return rotr64_S (a, n); + #else return ((a >> n) | ((a << (64 - n)))); + #endif } DECLSPEC u64 rotl64_S (const u64 a, const int n) { - return ((a << n) | ((a >> (64 - n)))); + return rotr64_S (a, 64 - n); } DECLSPEC u64 rotr64_S (const u64 a, const int n) { - return ((a >> n) | ((a << (64 - n)))); + vconv64_t in; + + in.v64 = a; + + const u32 a0 = in.v32.a; + const u32 a1 = in.v32.b; + + vconv64_t out; + + if (n < 32) + { + out.v32.a = amd_bitalign_S (a1, a0, n); + out.v32.b = amd_bitalign_S (a0, a1, n); + } + else + { + out.v32.a = amd_bitalign_S (a0, a1, n - 32); + out.v32.b = amd_bitalign_S (a1, a0, n - 32); + } + + return out.v64; } #define FIXED_THREAD_COUNT(n) __launch_bounds__((n), 0) diff --git a/src/backend.c b/src/backend.c index 6620d3d42..02cbbc3dd 100644 --- a/src/backend.c +++ b/src/backend.c @@ -10498,8 +10498,9 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p // TODO HIP // no -offload-arch= aka --gpu-architecture because hiprtc gets native arch from hip_context - hiprtc_options[0] = "--gpu-max-threads-per-block=64"; - hiprtc_options[1] = ""; + hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%u", device_param->kernel_threads); + + hiprtc_options[1] = "-O3"; hiprtc_options[2] = ""; hiprtc_options[3] = ""; @@ -11588,7 +11589,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) char device_name_chksum_amp_mp[HCBUFSIZ_TINY] = { 0 }; - const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s", + const size_t dnclen_amp_mp = snprintf (device_name_chksum_amp_mp, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d", backend_ctx->comptime, backend_ctx->cuda_driver_version, backend_ctx->hip_driver_version, @@ -11596,7 +11597,8 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->opencl_platform_vendor_id, device_param->device_name, device_param->opencl_device_version, - device_param->opencl_driver_version); + device_param->opencl_driver_version, + device_param->kernel_threads); md5_ctx_t md5_ctx; @@ -11887,7 +11889,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) const u32 extra_value = (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) ? ATTACK_MODE_ASSOCIATION : ATTACK_MODE_NONE; - const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d-%u-%u-%s", + const size_t dnclen = snprintf (device_name_chksum, HCBUFSIZ_TINY, "%d-%d-%d-%d-%u-%s-%s-%s-%d-%u-%d-%u-%s", backend_ctx->comptime, backend_ctx->cuda_driver_version, backend_ctx->hip_driver_version, @@ -11897,6 +11899,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->opencl_device_version, device_param->opencl_driver_version, device_param->vector_width, + device_param->kernel_threads, hashconfig->kern_type, extra_value, build_options_module_buf);