Backport inc_ecc_secp256k1 inline assembly code for AMD ISA

pull/2246/head
Jens Steube 5 years ago
parent cb24236067
commit 53254b45aa

@ -108,20 +108,40 @@ DECLSPEC u32 sub (u32 r[8], const u32 a[8], const u32 b[8])
{
u32 c = 0; // carry/borrow
#ifdef IS_NV
asm("sub.cc.u32 %0, %9, %17;"
"subc.cc.u32 %1, %10, %18;"
"subc.cc.u32 %2, %11, %19;"
"subc.cc.u32 %3, %12, %20;"
"subc.cc.u32 %4, %13, %21;"
"subc.cc.u32 %5, %14, %22;"
"subc.cc.u32 %6, %15, %23;"
"subc.cc.u32 %7, %16, %24;"
"subc.u32 %8, 0, 0;"
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
"=r"(c)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]));
#if defined IS_NV && HAS_SUB == 1 && HAS_SUBC == 1
asm volatile
(
"sub.cc.u32 %0, %9, %17;"
"subc.cc.u32 %1, %10, %18;"
"subc.cc.u32 %2, %11, %19;"
"subc.cc.u32 %3, %12, %20;"
"subc.cc.u32 %4, %13, %21;"
"subc.cc.u32 %5, %14, %22;"
"subc.cc.u32 %6, %15, %23;"
"subc.cc.u32 %7, %16, %24;"
"subc.u32 %8, 0, 0;"
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
"=r"(c)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif defined IS_AMD && HAS_VSUB == 1 && HAS_VSUBB == 1
__asm__ __volatile__
(
"V_SUB_U32 %0, %9, %17;"
"V_SUBB_U32 %1, %10, %18;"
"V_SUBB_U32 %2, %11, %19;"
"V_SUBB_U32 %3, %12, %20;"
"V_SUBB_U32 %4, %13, %21;"
"V_SUBB_U32 %5, %14, %22;"
"V_SUBB_U32 %6, %15, %23;"
"V_SUBB_U32 %7, %16, %24;"
"V_SUBB_U32 %8, 0, 0;"
: "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]),
"=v"(c)
: "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]),
"v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7])
);
#else
for (u32 i = 0; i < 8; i++)
{
@ -140,20 +160,40 @@ DECLSPEC u32 add (u32 r[8], const u32 a[8], const u32 b[8])
{
u32 c = 0; // carry/borrow
#ifdef IS_NV
asm("add.cc.u32 %0, %9, %17;"
"addc.cc.u32 %1, %10, %18;"
"addc.cc.u32 %2, %11, %19;"
"addc.cc.u32 %3, %12, %20;"
"addc.cc.u32 %4, %13, %21;"
"addc.cc.u32 %5, %14, %22;"
"addc.cc.u32 %6, %15, %23;"
"addc.cc.u32 %7, %16, %24;"
"addc.u32 %8, 0, 0;"
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
"=r"(c)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7]));
#if defined IS_NV && HAS_ADD == 1 && HAS_ADDC == 1
asm volatile
(
"add.cc.u32 %0, %9, %17;"
"addc.cc.u32 %1, %10, %18;"
"addc.cc.u32 %2, %11, %19;"
"addc.cc.u32 %3, %12, %20;"
"addc.cc.u32 %4, %13, %21;"
"addc.cc.u32 %5, %14, %22;"
"addc.cc.u32 %6, %15, %23;"
"addc.cc.u32 %7, %16, %24;"
"addc.u32 %8, 0, 0;"
: "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]), "=r"(r[4]), "=r"(r[5]), "=r"(r[6]), "=r"(r[7]),
"=r"(c)
: "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
"r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
);
#elif defined IS_AMD && HAS_VADD == 1 && HAS_VADDC == 1
__asm__ __volatile__
(
"V_ADD_U32 %0, %9, %17;"
"V_ADDC_U32 %1, %10, %18;"
"V_ADDC_U32 %2, %11, %19;"
"V_ADDC_U32 %3, %12, %20;"
"V_ADDC_U32 %4, %13, %21;"
"V_ADDC_U32 %5, %14, %22;"
"V_ADDC_U32 %6, %15, %23;"
"V_ADDC_U32 %7, %16, %24;"
"V_ADDC_U32 %8, 0, 0;"
: "=v"(r[0]), "=v"(r[1]), "=v"(r[2]), "=v"(r[3]), "=v"(r[4]), "=v"(r[5]), "=v"(r[6]), "=v"(r[7]),
"=v"(c)
: "v"(a[0]), "v"(a[1]), "v"(a[2]), "v"(a[3]), "v"(a[4]), "v"(a[5]), "v"(a[6]), "v"(a[7]),
"v"(b[0]), "v"(b[1]), "v"(b[2]), "v"(b[3]), "v"(b[4]), "v"(b[5]), "v"(b[6]), "v"(b[7])
);
#else
for (u32 i = 0; i < 8; i++)
{

@ -1238,11 +1238,19 @@ typedef struct hc_device_param
hc_timer_t timer_speed;
// AMD
bool has_vadd;
bool has_vaddc;
bool has_vsub;
bool has_vsubb;
bool has_vadd3;
bool has_vbfe;
bool has_vperm;
// NV
bool has_add;
bool has_addc;
bool has_sub;
bool has_subc;
bool has_bfe;
bool has_lop3;
bool has_mov64;

@ -5432,6 +5432,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
//
//if (rc_cuCtxSetCacheConfig == -1) return -1;
device_param->has_add = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_addc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_sub = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_subc = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_bfe = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_lop3 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned int r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_mov64 = cuda_test_instruction (hashcat_ctx, sm_major, sm_minor, "__global__ void test () { unsigned long long r; unsigned int a; unsigned int b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");
@ -6132,6 +6136,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_AMD))
{
device_param->has_vadd = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_ADD_U32 %0, 0, 0;\" : \"=v\"(r)); }");
device_param->has_vaddc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_ADDC_U32 %0, 0, 0;\" : \"=v\"(r)); }");
device_param->has_vsub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_SUB_U32 %0, 0, 0;\" : \"=v\"(r)); }");
device_param->has_vsubb = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_SUBB_U32 %0, 0, 0;\" : \"=v\"(r)); }");
device_param->has_vadd3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_ADD3_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
device_param->has_vbfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_BFE_U32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
device_param->has_vperm = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; __asm__ __volatile__ (\"V_PERM_B32 %0, 0, 0, 0;\" : \"=v\"(r)); }");
@ -6139,6 +6147,10 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime)
if ((device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) && (device_param->opencl_platform_vendor_id == VENDOR_ID_NV))
{
device_param->has_add = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"add.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_addc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"addc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_sub = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"sub.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_subc = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"subc.cc.u32 %0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_bfe = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"bfe.u32 %0, 0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_lop3 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { uint r; asm volatile (\"lop3.b32 %0, 0, 0, 0, 0;\" : \"=r\"(r)); }");
device_param->has_mov64 = opencl_test_instruction (hashcat_ctx, context, device_param->opencl_device, "__kernel void test () { ulong r; uint a; uint b; asm volatile (\"mov.b64 %0, {%1, %2};\" : \"=l\"(r) : \"r\"(a), \"r\"(b)); }");
@ -7007,9 +7019,9 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx)
// we don't have sm_* on vendors not NV but it doesn't matter
#if defined (DEBUG)
build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vsub, device_param->has_vsubb, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
#else
build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
build_options_len += snprintf (build_options_buf + build_options_len, build_options_sz - build_options_len, "-D LOCAL_MEM_TYPE=%d -D VENDOR_ID=%u -D CUDA_ARCH=%u -D HAS_ADD=%u -D HAS_ADDC=%u -D HAS_SUB=%u -D HAS_SUBC=%u -D HAS_VADD=%u -D HAS_VADDC=%u -D HAS_VSUB=%u -D HAS_VSUBB=%u -D HAS_VPERM=%u -D HAS_VADD3=%u -D HAS_VBFE=%u -D HAS_BFE=%u -D HAS_LOP3=%u -D HAS_MOV64=%u -D HAS_PRMT=%u -D VECT_SIZE=%d -D DEVICE_TYPE=%u -D DGST_R0=%u -D DGST_R1=%u -D DGST_R2=%u -D DGST_R3=%u -D DGST_ELEM=%u -D KERN_TYPE=%u -D ATTACK_EXEC=%u -D ATTACK_KERN=%u -D _unroll -w ", device_param->device_local_mem_type, device_param->opencl_platform_vendor_id, (device_param->sm_major * 100) + (device_param->sm_minor * 10), device_param->has_add, device_param->has_addc, device_param->has_sub, device_param->has_subc, device_param->has_vadd, device_param->has_vaddc, device_param->has_vsub, device_param->has_vsubb, device_param->has_vperm, device_param->has_vadd3, device_param->has_vbfe, device_param->has_bfe, device_param->has_lop3, device_param->has_mov64, device_param->has_prmt, device_param->vector_width, (u32) device_param->opencl_device_type, hashconfig->dgst_pos0, hashconfig->dgst_pos1, hashconfig->dgst_pos2, hashconfig->dgst_pos3, hashconfig->dgst_size / 4, kern_type, hashconfig->attack_exec, user_options_extra->attack_kern);
#endif
build_options_buf[build_options_len] = 0;

Loading…
Cancel
Save