Allow OpenCL kernel inline assembly if ROCm drivers was detected

pull/1342/head^2
jsteube 7 years ago
parent 7b71fb803b
commit 51372438fe

@ -2968,7 +2968,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
#if defined IS_AMD_LEGACY || defined IS_GENERIC
w0[0] = swap32 (w0[0]);
w0[1] = swap32 (w0[1]);
w0[2] = swap32 (w0[2]);
@ -3327,8 +3327,15 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
w3[3] = swap32 (w3[3]);
#endif
#ifdef IS_NV
#if defined IS_AMD_ROCM || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
switch (offset / 4)
{
@ -3652,6 +3659,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
break;
}
#endif
}
@ -32192,7 +32200,7 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
#if defined IS_AMD_LEGACY || defined IS_GENERIC
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
@ -32551,8 +32559,15 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
w3[3] = swap32_S (w3[3]);
#endif
#ifdef IS_NV
#if defined IS_AMD_ROCM || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
switch (offset / 4)
{

@ -272,6 +272,73 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
return amd_bytealign (a, b, c);
}
#ifdef IS_AMD_ROCM
static u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
u32x r;
#if VECT_SIZE == 1
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
#endif
#if VECT_SIZE >= 2
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
#endif
#if VECT_SIZE >= 4
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
#endif
#if VECT_SIZE >= 8
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
#endif
#if VECT_SIZE >= 16
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s0) : "v"(b.s0), "v"(a.s0), "v"(c.s0));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s1) : "v"(b.s1), "v"(a.s1), "v"(c.s1));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s2) : "v"(b.s2), "v"(a.s2), "v"(c.s2));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s3) : "v"(b.s3), "v"(a.s3), "v"(c.s3));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s4) : "v"(b.s4), "v"(a.s4), "v"(c.s4));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s5) : "v"(b.s5), "v"(a.s5), "v"(c.s5));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s6) : "v"(b.s6), "v"(a.s6), "v"(c.s6));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s7) : "v"(b.s7), "v"(a.s7), "v"(c.s7));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s8) : "v"(b.s8), "v"(a.s8), "v"(c.s8));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.s9) : "v"(b.s9), "v"(a.s9), "v"(c.s9));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sa) : "v"(b.sa), "v"(a.sa), "v"(c.sa));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sb) : "v"(b.sb), "v"(a.sb), "v"(c.sb));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sc) : "v"(b.sc), "v"(a.sc), "v"(c.sc));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sd) : "v"(b.sd), "v"(a.sd), "v"(c.sd));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.se) : "v"(b.se), "v"(a.se), "v"(c.se));
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r.sf) : "v"(b.sf), "v"(a.sf), "v"(c.sf));
#endif
return r;
}
#endif
#ifdef IS_AMD_ROCM
static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
__asm__ volatile ("V_PERM_B32 %0, %1, %2, %3;" : "=v"(r) : "v"(b), "v"(a), "v"(c));
return r;
}
#endif
#endif
#ifdef IS_NV

@ -30,8 +30,13 @@
*/
#if VENDOR_ID == (1 << 0)
#if AMD_ROCM == 0
#define IS_AMD
//#define IS_GENERIC
#define IS_AMD_LEGACY
#else
#define IS_AMD
#define IS_AMD_ROCM
#endif
#elif VENDOR_ID == (1 << 1)
#define IS_APPLE
#define IS_GENERIC
@ -46,7 +51,6 @@
#define IS_GENERIC
#elif VENDOR_ID == (1 << 5)
#define IS_NV
//#define IS_GENERIC
#elif VENDOR_ID == (1 << 6)
#define IS_POCL
#define IS_GENERIC

@ -1029,6 +1029,8 @@ typedef struct hc_device_param
char *driver_version;
char *device_opencl_version;
bool is_rocm;
double nvidia_spin_damp;
cl_platform_id platform;

@ -3349,6 +3349,8 @@ int opencl_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
{
// Support for ROCm platform
if (atof (device_param->driver_version) >= 1.1) amd_warn = false;
device_param->is_rocm = true;
}
#elif defined (_WIN)
// AMD Radeon Software 14.9 and higher, should be updated to 15.12
@ -4273,9 +4275,9 @@ int opencl_session_begin (hashcat_ctx_t *hashcat_ctx)
char build_opts_new[1024] = { 0 };
#if defined (DEBUG)
snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type);
snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D AMD_ROCM=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->is_rocm, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type);
#else
snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type);
snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -D VENDOR_ID=%u -D CUDA_ARCH=%u -D AMD_ROCM=%u -D VECT_SIZE=%u -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D _unroll -w", build_opts, device_param->platform_vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->is_rocm, device_param->vector_width, (u32) device_param->device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, hashconfig->kern_type);
#endif
if (device_param->device_type & CL_DEVICE_TYPE_CPU)

Loading…
Cancel
Save