From 51372438fe8a559698641e046c402a0364ee0bcb Mon Sep 17 00:00:00 2001 From: jsteube Date: Tue, 22 Aug 2017 18:47:53 +0200 Subject: [PATCH] Allow OpenCL kernel inline assembly if ROCm drivers was detected --- OpenCL/inc_common.cl | 23 ++++++++++++--- OpenCL/inc_types.cl | 67 ++++++++++++++++++++++++++++++++++++++++++++ OpenCL/inc_vendor.cl | 8 ++++-- include/types.h | 2 ++ src/opencl.c | 6 ++-- 5 files changed, 98 insertions(+), 8 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 53b23989b..9efbec58b 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -2968,7 +2968,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD || defined IS_GENERIC + #if defined IS_AMD_LEGACY || defined IS_GENERIC w0[0] = swap32 (w0[0]); w0[1] = swap32 (w0[1]); w0[2] = swap32 (w0[2]); @@ -3327,8 +3327,15 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[3] = swap32 (w3[3]); #endif - #ifdef IS_NV + #if defined IS_AMD_ROCM || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if defined IS_AMD_ROCM + const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); + #endif switch (offset / 4) { @@ -3652,6 +3659,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x break; } + #endif } @@ -32192,7 +32200,7 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD || defined IS_GENERIC + #if defined IS_AMD_LEGACY || defined IS_GENERIC w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); @@ -32551,8 +32559,15 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w3[3] = swap32_S (w3[3]); #endif - #ifdef IS_NV + #if defined IS_AMD_ROCM || defined IS_NV + + #if defined IS_NV const int 
selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
+  #endif
+
+  #if defined IS_AMD_ROCM
+  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
+  #endif
 
   switch (offset / 4)
   {
diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl
index cfc05ddca..f4a997bb0 100644
--- a/OpenCL/inc_types.cl
+++ b/OpenCL/inc_types.cl
@@ -272,6 +272,73 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
 {
   return amd_bytealign (a, b, c);
 }
+
+#ifdef IS_AMD_ROCM
+/*
+ * Per-component byte permute for ROCm builds (IS_AMD_ROCM): emits one
+ * V_PERM_B32 instruction per u32x component through inline assembly.
+ * Operands are handed to the instruction in the order b, a, c.
+ * Each VECT_SIZE tier below handles only the components the previous
+ * tiers did not, so every component is computed exactly once.
+ */
+static u32x __byte_perm (const u32x a, const u32x b, const u32x c)
+{
+  u32x r;
+
+  #if VECT_SIZE == 1
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
+  #endif
+
+  #if VECT_SIZE >= 2
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
+  #endif
+
+  #if VECT_SIZE >= 4
+  // s0..s1 come from the VECT_SIZE >= 2 tier; add s2..s3 only
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
+  #endif
+
+  #if VECT_SIZE >= 8
+  // s0..s3 come from the tiers above; add s4..s7 only
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
+  #endif
+
+  #if VECT_SIZE >= 16
+  // s0..s7 come from the tiers above; add s8..sf only
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s8) : "v"(b.s8), "v"(a.s8), "v"(c.s8));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s9) : "v"(b.s9), "v"(a.s9), "v"(c.s9));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sa) : "v"(b.sa), "v"(a.sa), "v"(c.sa));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sb) : "v"(b.sb), "v"(a.sb), "v"(c.sb));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sc) : "v"(b.sc), "v"(a.sc), "v"(c.sc));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sd) : "v"(b.sd), "v"(a.sd), "v"(c.sd));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.se) : "v"(b.se), "v"(a.se), "v"(c.se));
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sf) : "v"(b.sf), "v"(a.sf), "v"(c.sf));
+  #endif
+
+  return r;
+}
+#endif
+
+#ifdef IS_AMD_ROCM
+/*
+ * Scalar variant of __byte_perm: a single V_PERM_B32 on u32 operands,
+ * issued with the same b, a, c operand order.
+ */
+static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
+{
+  u32 r;
+
+  __asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
+
+  return r;
+}
+#endif
+
#endif #ifdef IS_NV diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index 4fc141017..b608854bf 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -30,8 +30,13 @@ */ #if VENDOR_ID == (1 << 0) +#if AMD_ROCM == 0 #define IS_AMD -//#define IS_GENERIC +#define IS_AMD_LEGACY +#else +#define IS_AMD +#define IS_AMD_ROCM +#endif #elif VENDOR_ID == (1 << 1) #define IS_APPLE #define IS_GENERIC @@ -46,7 +51,6 @@ #define IS_GENERIC #elif VENDOR_ID == (1 << 5) #define IS_NV -//#define IS_GENERIC #elif VENDOR_ID == (1 << 6) #define IS_POCL #define IS_GENERIC diff --git a/include/types.h b/include/types.h index b2b39004f..ce37115fb 100644 --- a/include/types.h +++ b/include/types.h @@ -1029,6 +1029,8 @@ typedef struct hc_device_param char *driver_version; char *device_opencl_version; + bool is_rocm; + double nvidia_spin_damp; cl_platform_id platform; diff --git a/src/opencl.c b/src/opencl.c index e1a4babc0..3096b9a2a 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -3349,6 +3349,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) { // Support for ROCm platform if (atof (device_param->driver_version) >= 1.1) amd_warn = false; + + device_param->is_rocm = true; } #elif defined (_WIN) // AMD Radeon Software 14.9 and higher, should be updated to 15.12 @@ -4273,9 +4275,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx) char build_opts_new[1024] = { 0 }; #if defined (DEBUG) - snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); + snprintf (build_opts_new, 
sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D AMD_ROCM=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->is_rocm, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); #else - snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); + snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D AMD_ROCM=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->is_rocm, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type); #endif if (device_param->device_type & CL_DEVICE_TYPE_CPU)