diff --git a/OpenCL/inc_platform.cl b/OpenCL/inc_platform.cl index 3f1336f92..8ccb034aa 100644 --- a/OpenCL/inc_platform.cl +++ b/OpenCL/inc_platform.cl @@ -253,7 +253,7 @@ DECLSPEC u32 amd_bitalign_S (const u32 a, const u32 b, const int n) { u32 r = 0; - asm ("V_ALIGNBIT_B32 %0, %1, %2, %3;" : "=v"(r): "v"(a), "v"(b), "I"(n)); + __asm__ ("V_ALIGNBIT_B32 %0, %1, %2, %3;" : "=v"(r): "v"(a), "v"(b), "I"(n)); return r; } diff --git a/OpenCL/inc_vendor.h b/OpenCL/inc_vendor.h index a94bbefd4..f4c31f59a 100644 --- a/OpenCL/inc_vendor.h +++ b/OpenCL/inc_vendor.h @@ -95,8 +95,6 @@ #define IS_GENERIC #elif VENDOR_ID == (1 << 8) #define IS_AMD_USE_HIP -// TODO HIP optimization potential -//#define IS_GENERIC #else #define IS_GENERIC #endif @@ -158,10 +156,8 @@ #endif #ifdef IS_HIP -//TODO HIP -//#define USE_BITSELECT -//#define USE_ROTATE -//#define USE_SWIZZLE +#define USE_BITSELECT +#define USE_ROTATE #endif #ifdef IS_ROCM diff --git a/OpenCL/m01700_a0-optimized.cl b/OpenCL/m01700_a0-optimized.cl index 18c7c61ab..dcc4c217e 100644 --- a/OpenCL/m01700_a0-optimized.cl +++ b/OpenCL/m01700_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); -#if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01700_a1-optimized.cl b/OpenCL/m01700_a1-optimized.cl index 21efdcc46..6ca96c818 100644 --- a/OpenCL/m01700_a1-optimized.cl +++ b/OpenCL/m01700_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01700_a3-optimized.cl b/OpenCL/m01700_a3-optimized.cl index 064044263..6444cfae0 100644 --- a/OpenCL/m01700_a3-optimized.cl +++ b/OpenCL/m01700_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01710_a0-optimized.cl b/OpenCL/m01710_a0-optimized.cl index 4b66b83f5..2c72b062f 100644 --- a/OpenCL/m01710_a0-optimized.cl +++ b/OpenCL/m01710_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01710_a1-optimized.cl b/OpenCL/m01710_a1-optimized.cl index e7b691334..45111549b 100644 --- a/OpenCL/m01710_a1-optimized.cl +++ b/OpenCL/m01710_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01710_a3-optimized.cl b/OpenCL/m01710_a3-optimized.cl index 1e893c967..76a331f7e 100644 --- a/OpenCL/m01710_a3-optimized.cl +++ b/OpenCL/m01710_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01720_a0-optimized.cl b/OpenCL/m01720_a0-optimized.cl index 6def5fff2..bfb0bb37e 100644 --- a/OpenCL/m01720_a0-optimized.cl +++ b/OpenCL/m01720_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01720_a1-optimized.cl b/OpenCL/m01720_a1-optimized.cl index ffe6fe15a..ade402112 100644 --- a/OpenCL/m01720_a1-optimized.cl +++ b/OpenCL/m01720_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01720_a3-optimized.cl b/OpenCL/m01720_a3-optimized.cl index 3fdf675e0..5dfc2b9d2 100644 --- a/OpenCL/m01720_a3-optimized.cl +++ b/OpenCL/m01720_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01730_a0-optimized.cl b/OpenCL/m01730_a0-optimized.cl index 45c025215..14a965c4b 100644 --- a/OpenCL/m01730_a0-optimized.cl +++ b/OpenCL/m01730_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01730_a1-optimized.cl b/OpenCL/m01730_a1-optimized.cl index 03b3e10af..0fe9c945d 100644 --- a/OpenCL/m01730_a1-optimized.cl +++ b/OpenCL/m01730_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01730_a3-optimized.cl b/OpenCL/m01730_a3-optimized.cl index b114b8c18..e533b3e95 100644 --- a/OpenCL/m01730_a3-optimized.cl +++ b/OpenCL/m01730_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01740_a0-optimized.cl b/OpenCL/m01740_a0-optimized.cl index b04db3c82..b9be203f5 100644 --- a/OpenCL/m01740_a0-optimized.cl +++ b/OpenCL/m01740_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01740_a1-optimized.cl b/OpenCL/m01740_a1-optimized.cl index 2fdb41cde..599cdcbad 100644 --- a/OpenCL/m01740_a1-optimized.cl +++ b/OpenCL/m01740_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m01740_a3-optimized.cl b/OpenCL/m01740_a3-optimized.cl index 949fa4999..d432b4f4f 100644 --- a/OpenCL/m01740_a3-optimized.cl +++ b/OpenCL/m01740_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m08000_a0-optimized.cl b/OpenCL/m08000_a0-optimized.cl index 873d4805b..67bdf8d61 100644 --- a/OpenCL/m08000_a0-optimized.cl +++ b/OpenCL/m08000_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -143,7 +143,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_STEP_Z (16); ROUND_STEP_Z (32); ROUND_STEP_Z (48); diff --git a/OpenCL/m08000_a1-optimized.cl b/OpenCL/m08000_a1-optimized.cl index e05eb37f2..01c925243 100644 --- a/OpenCL/m08000_a1-optimized.cl +++ b/OpenCL/m08000_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -141,7 +141,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_STEP_Z (16); ROUND_STEP_Z (32); ROUND_STEP_Z (48); diff --git a/OpenCL/m08000_a3-optimized.cl b/OpenCL/m08000_a3-optimized.cl index f62608a0e..14ee12d12 100644 --- a/OpenCL/m08000_a3-optimized.cl +++ b/OpenCL/m08000_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w) ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -141,7 +141,7 @@ DECLSPEC void sha256_transform_z (u32x *digest) ROUND_STEP_Z (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_STEP_Z (16); ROUND_STEP_Z (32); ROUND_STEP_Z (48); diff --git a/OpenCL/m10800_a0-optimized.cl b/OpenCL/m10800_a0-optimized.cl index 7bbb0cd51..26d5ac84f 100644 --- a/OpenCL/m10800_a0-optimized.cl +++ b/OpenCL/m10800_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m10800_a1-optimized.cl b/OpenCL/m10800_a1-optimized.cl index ff3014167..f09627684 100644 --- a/OpenCL/m10800_a1-optimized.cl +++ b/OpenCL/m10800_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m10800_a3-optimized.cl b/OpenCL/m10800_a3-optimized.cl index 031ae5100..eb3a08a41 100644 --- a/OpenCL/m10800_a3-optimized.cl +++ b/OpenCL/m10800_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m21000_a0-optimized.cl b/OpenCL/m21000_a0-optimized.cl index 7b782c877..d0f88c06a 100644 --- a/OpenCL/m21000_a0-optimized.cl +++ b/OpenCL/m21000_a0-optimized.cl @@ -89,7 +89,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x * ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m21000_a1-optimized.cl b/OpenCL/m21000_a1-optimized.cl index ba792b588..f9110176d 100644 --- a/OpenCL/m21000_a1-optimized.cl +++ b/OpenCL/m21000_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_full (const u32x *w0, const u32x *w1, const u32x ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -182,7 +182,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x * ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m21000_a3-optimized.cl b/OpenCL/m21000_a3-optimized.cl index f03742d40..cdb3f0038 100644 --- a/OpenCL/m21000_a3-optimized.cl +++ b/OpenCL/m21000_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_full (const u32x *w0, const u32x *w1, const u32x ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); @@ -182,7 +182,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x * ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m22200_a0-optimized.cl b/OpenCL/m22200_a0-optimized.cl index cafa7af7c..150380d44 100644 --- a/OpenCL/m22200_a0-optimized.cl +++ b/OpenCL/m22200_a0-optimized.cl @@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m22200_a1-optimized.cl b/OpenCL/m22200_a1-optimized.cl index fb40d5406..01d192b99 100644 --- a/OpenCL/m22200_a1-optimized.cl +++ b/OpenCL/m22200_a1-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/OpenCL/m22200_a3-optimized.cl b/OpenCL/m22200_a3-optimized.cl index 211522e7f..48cf61584 100644 --- a/OpenCL/m22200_a3-optimized.cl +++ b/OpenCL/m22200_a3-optimized.cl @@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32 ROUND_STEP (0); - #if defined IS_CUDA || defined IS_HIP + #if defined IS_CUDA ROUND_EXPAND (); ROUND_STEP (16); ROUND_EXPAND (); ROUND_STEP (32); ROUND_EXPAND (); ROUND_STEP (48); diff --git a/src/backend.c b/src/backend.c index 53de2d525..4caff74ce 100644 --- a/src/backend.c +++ b/src/backend.c @@ -8339,18 +8339,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) device_param->has_mov64 = false; device_param->has_prmt = false; - device_param->has_vadd = true; - device_param->has_vaddc = true; - device_param->has_vadd_co = true; - device_param->has_vaddc_co = true; - device_param->has_vsub = true; - device_param->has_vsubb = true; - device_param->has_vsub_co = true; - device_param->has_vsubb_co = true; - device_param->has_vadd3 = true; - device_param->has_vbfe = true; - device_param->has_vperm = true; - // device_available_mem HIPcontext hip_context; @@ -9528,7 +9516,27 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) if (backend_ctx->hip) { - // TODO HIP + // TODO HIP? + // Maybe all devices supported by hip have these instructions guaranteed? + + for (int backend_devices_cnt = 0; backend_devices_cnt < backend_ctx->backend_devices_cnt; backend_devices_cnt++) + { + hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_cnt]; + + if (device_param->is_hip == false) continue; + + device_param->has_vadd = true; + device_param->has_vaddc = true; + device_param->has_vadd_co = true; + device_param->has_vaddc_co = true; + device_param->has_vsub = true; + device_param->has_vsubb = true; + device_param->has_vsub_co = true; + device_param->has_vsubb_co = true; + device_param->has_vadd3 = true; + device_param->has_vbfe = true; + device_param->has_vperm = true; + } } if (backend_ctx->ocl) @@ -10495,9 +10503,6 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p //hc_asprintf (&hiprtc_options[3], "compute_%d%d", device_param->sm_major, device_param->sm_minor); - // TODO HIP - // no -offload-arch= aka --gpu-architecture because hiprtc gets native arch from hip_context - hiprtc_options[0] = "--gpu-max-threads-per-block=64"; hiprtc_options[1] = ""; hiprtc_options[2] = ""; diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl index fc7efad2c..df6777441 100755 --- a/tools/benchmark_deep.pl +++ b/tools/benchmark_deep.pl @@ -13,7 +13,7 @@ my $amd_cache = "~/.AMD"; my $hashcat_path = "."; my $kernels_cache = "$hashcat_path/kernels"; my $hashcat_bin = "$hashcat_path/hashcat"; -my $device = 3; +my $device = 1; my $workload_profile = 3; my $runtime = 24; my $sleep_sec = 12;