Backport more ROCm based optimizations to HIP

pull/2883/head
Jens Steube 3 years ago
parent 2e929e692e
commit 45e65dd05a

@@ -253,7 +253,7 @@ DECLSPEC u32 amd_bitalign_S (const u32 a, const u32 b, const int n)
{
u32 r = 0;
asm ("V_ALIGNBIT_B32 %0, %1, %2, %3;" : "=v"(r): "v"(a), "v"(b), "I"(n));
__asm__ ("V_ALIGNBIT_B32 %0, %1, %2, %3;" : "=v"(r): "v"(a), "v"(b), "I"(n));
return r;
}

@@ -95,8 +95,6 @@
#define IS_GENERIC
#elif VENDOR_ID == (1 << 8)
#define IS_AMD_USE_HIP
// TODO HIP optimization potential
//#define IS_GENERIC
#else
#define IS_GENERIC
#endif
@@ -158,10 +156,8 @@
#endif
#ifdef IS_HIP
//TODO HIP
//#define USE_BITSELECT
//#define USE_ROTATE
//#define USE_SWIZZLE
#define USE_BITSELECT
#define USE_ROTATE
#endif
#ifdef IS_ROCM

@@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -86,7 +86,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);
@@ -143,7 +143,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
ROUND_STEP_Z (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_STEP_Z (16);
ROUND_STEP_Z (32);
ROUND_STEP_Z (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);
@@ -141,7 +141,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
ROUND_STEP_Z (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_STEP_Z (16);
ROUND_STEP_Z (32);
ROUND_STEP_Z (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha256_transform_m (u32x *digest, const u32x *w)
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);
@@ -141,7 +141,7 @@ DECLSPEC void sha256_transform_z (u32x *digest)
ROUND_STEP_Z (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_STEP_Z (16);
ROUND_STEP_Z (32);
ROUND_STEP_Z (48);

@@ -86,7 +86,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha384_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -89,7 +89,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x *
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_full (const u32x *w0, const u32x *w1, const u32x
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);
@@ -182,7 +182,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x *
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_full (const u32x *w0, const u32x *w1, const u32x
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);
@@ -182,7 +182,7 @@ DECLSPEC void sha512_transform_opt (const u32x *w0, const u32x *w1, const u32x *
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -86,7 +86,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -84,7 +84,7 @@ DECLSPEC void sha512_transform_intern (const u32x *w0, const u32x *w1, const u32
ROUND_STEP (0);
#if defined IS_CUDA || defined IS_HIP
#if defined IS_CUDA
ROUND_EXPAND (); ROUND_STEP (16);
ROUND_EXPAND (); ROUND_STEP (32);
ROUND_EXPAND (); ROUND_STEP (48);

@@ -8339,18 +8339,6 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
device_param->has_mov64 = false;
device_param->has_prmt = false;
device_param->has_vadd = true;
device_param->has_vaddc = true;
device_param->has_vadd_co = true;
device_param->has_vaddc_co = true;
device_param->has_vsub = true;
device_param->has_vsubb = true;
device_param->has_vsub_co = true;
device_param->has_vsubb_co = true;
device_param->has_vadd3 = true;
device_param->has_vbfe = true;
device_param->has_vperm = true;
// device_available_mem
HIPcontext hip_context;
@@ -9528,7 +9516,27 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
if (backend_ctx->hip)
{
// TODO HIP
// TODO HIP?
// Maybe all devices supported by hip have these instructions guaranteed?
for (int backend_devices_cnt = 0; backend_devices_cnt < backend_ctx->backend_devices_cnt; backend_devices_cnt++)
{
hc_device_param_t *device_param = &backend_ctx->devices_param[backend_devices_cnt];
if (device_param->is_hip == false) continue;
device_param->has_vadd = true;
device_param->has_vaddc = true;
device_param->has_vadd_co = true;
device_param->has_vaddc_co = true;
device_param->has_vsub = true;
device_param->has_vsubb = true;
device_param->has_vsub_co = true;
device_param->has_vsubb_co = true;
device_param->has_vadd3 = true;
device_param->has_vbfe = true;
device_param->has_vperm = true;
}
}
if (backend_ctx->ocl)
@@ -10495,9 +10503,6 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p
//hc_asprintf (&hiprtc_options[3], "compute_%d%d", device_param->sm_major, device_param->sm_minor);
// TODO HIP
// no -offload-arch= aka --gpu-architecture because hiprtc gets native arch from hip_context
hiprtc_options[0] = "--gpu-max-threads-per-block=64";
hiprtc_options[1] = "";
hiprtc_options[2] = "";

@@ -13,7 +13,7 @@ my $amd_cache = "~/.AMD";
my $hashcat_path = ".";
my $kernels_cache = "$hashcat_path/kernels";
my $hashcat_bin = "$hashcat_path/hashcat";
my $device = 3;
my $device = 1;
my $workload_profile = 3;
my $runtime = 24;
my $sleep_sec = 12;

Loading…
Cancel
Save