OpenCL Runtime: Workaround JiT compiler segfault on legacy AMDGPU driver compiling RAR3 OpenCL kernel

pull/2639/head^2
Jens Steube 3 years ago
parent ff72a8ed21
commit e4dab0f1bf

@ -1431,13 +1431,12 @@ DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
{
// Picks one of four shift/OR combinations of a and b from the low two bits of c:
//   0 -> b
//   1 -> (a << 24) | (b >>  8)
//   2 -> (a << 16) | (b >> 16)
//   3 -> (a <<  8) | (b >> 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32x r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a << 24) | (b >> 8); break;
case 2: r = (a << 16) | (b >> 16); break;
case 3: r = (a << 8) | (b >> 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a << 24) | (b >> 8); }
else if (cm == 2) { r = (a << 16) | (b >> 16); }
else if (cm == 3) { r = (a << 8) | (b >> 24); }
return r;
}
@ -1446,13 +1445,12 @@ DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
{
// u32 counterpart of the shift/OR selection driven by the low two bits of c:
//   0 -> b
//   1 -> (a << 24) | (b >>  8)
//   2 -> (a << 16) | (b >> 16)
//   3 -> (a <<  8) | (b >> 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32 r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a << 24) | (b >> 8); break;
case 2: r = (a << 16) | (b >> 16); break;
case 3: r = (a << 8) | (b >> 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a << 24) | (b >> 8); }
else if (cm == 2) { r = (a << 16) | (b >> 16); }
else if (cm == 3) { r = (a << 8) | (b >> 24); }
return r;
}
@ -1461,13 +1459,12 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
{
// Picks one of four shift/OR combinations of a and b from the low two bits of c.
// The shift directions are the mirror image of the *_be variant:
//   0 -> b
//   1 -> (a >> 24) | (b <<  8)
//   2 -> (a >> 16) | (b << 16)
//   3 -> (a >>  8) | (b << 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32x r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a >> 24) | (b << 8); break;
case 2: r = (a >> 16) | (b << 16); break;
case 3: r = (a >> 8) | (b << 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a >> 24) | (b << 8); }
else if (cm == 2) { r = (a >> 16) | (b << 16); }
else if (cm == 3) { r = (a >> 8) | (b << 24); }
return r;
}
@ -1476,13 +1473,12 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
{
// u32 counterpart of the shift/OR selection driven by the low two bits of c:
//   0 -> b
//   1 -> (a >> 24) | (b <<  8)
//   2 -> (a >> 16) | (b << 16)
//   3 -> (a >>  8) | (b << 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32 r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a >> 24) | (b << 8); break;
case 2: r = (a >> 16) | (b << 16); break;
case 3: r = (a >> 8) | (b << 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a >> 24) | (b << 8); }
else if (cm == 2) { r = (a >> 16) | (b << 16); }
else if (cm == 3) { r = (a >> 8) | (b << 24); }
return r;
}
@ -1845,13 +1841,12 @@ DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const int c)
{
// Picks one of four shift/OR combinations of a and b from the low two bits of c:
//   0 -> b
//   1 -> (a << 24) | (b >>  8)
//   2 -> (a << 16) | (b >> 16)
//   3 -> (a <<  8) | (b >> 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32x r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a << 24) | (b >> 8); break;
case 2: r = (a << 16) | (b >> 16); break;
case 3: r = (a << 8) | (b >> 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a << 24) | (b >> 8); }
else if (cm == 2) { r = (a << 16) | (b >> 16); }
else if (cm == 3) { r = (a << 8) | (b >> 24); }
return r;
}
@ -1860,13 +1855,12 @@ DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const int c)
{
// u32 counterpart of the shift/OR selection driven by the low two bits of c:
//   0 -> b
//   1 -> (a << 24) | (b >>  8)
//   2 -> (a << 16) | (b >> 16)
//   3 -> (a <<  8) | (b >> 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32 r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a << 24) | (b >> 8); break;
case 2: r = (a << 16) | (b >> 16); break;
case 3: r = (a << 8) | (b >> 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a << 24) | (b >> 8); }
else if (cm == 2) { r = (a << 16) | (b >> 16); }
else if (cm == 3) { r = (a << 8) | (b >> 24); }
return r;
}
@ -1875,13 +1869,12 @@ DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const int c)
{
// Picks one of four shift/OR combinations of a and b from the low two bits of c.
// The shift directions are the mirror image of the *_be variant:
//   0 -> b
//   1 -> (a >> 24) | (b <<  8)
//   2 -> (a >> 16) | (b << 16)
//   3 -> (a >>  8) | (b << 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32x r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a >> 24) | (b << 8); break;
case 2: r = (a >> 16) | (b << 16); break;
case 3: r = (a >> 8) | (b << 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a >> 24) | (b << 8); }
else if (cm == 2) { r = (a >> 16) | (b << 16); }
else if (cm == 3) { r = (a >> 8) | (b << 24); }
return r;
}
@ -1890,13 +1883,12 @@ DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const int c)
{
// u32 counterpart of the shift/OR selection driven by the low two bits of c:
//   0 -> b
//   1 -> (a >> 24) | (b <<  8)
//   2 -> (a >> 16) | (b << 16)
//   3 -> (a >>  8) | (b << 24)
// All other bits of c are ignored. r starts at 0 and every value of (c & 3) assigns it.
// NOTE(review): this view lost the diff's +/- markers; the switch appears to be the
// pre-change form and the if/else-if chain after it the replacement -- confirm against the commit.
u32 r = 0;
switch (c & 3)
{
case 0: r = b; break;
case 1: r = (a >> 24) | (b << 8); break;
case 2: r = (a >> 16) | (b << 16); break;
case 3: r = (a >> 8) | (b << 24); break;
}
// Same four cases as above, expressed as an if/else-if chain on the masked selector.
const int cm = c & 3;
if (cm == 0) { r = b; }
else if (cm == 1) { r = (a >> 24) | (b << 8); }
else if (cm == 2) { r = (a >> 16) | (b << 16); }
else if (cm == 3) { r = (a >> 8) | (b << 24); }
return r;
}

@ -152,23 +152,15 @@ KERNEL_FQ void m12500_loop (KERN_ATTR_TMPS (rar3_tmp_t))
u32 tmp0 = 0;
u32 tmp1 = 0;
switch (k & 3)
{
case 0: tmp0 = iter_s >> 0; mask0 = 0x0000ffff;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 1: tmp0 = iter_s >> 8; mask0 = 0xff0000ff;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 2: tmp0 = iter_s >> 16; mask0 = 0xffff0000;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 3: tmp0 = iter_s >> 24; mask0 = 0xffffff00;
tmp1 = iter_s << 8; mask1 = 0x00ffffff;
break;
}
const int kd = k / 4;
const int km = k & 3;
if (km == 0) { tmp0 = iter_s >> 0; tmp1 = 0; mask0 = 0x0000ffff; mask1 = 0xffffffff; }
else if (km == 1) { tmp0 = iter_s >> 8; tmp1 = 0; mask0 = 0xff0000ff; mask1 = 0xffffffff; }
else if (km == 2) { tmp0 = iter_s >> 16; tmp1 = 0; mask0 = 0xffff0000; mask1 = 0xffffffff; }
else if (km == 3) { tmp0 = iter_s >> 24; tmp1 = iter_s << 8; mask0 = 0xffffff00; mask1 = 0x00ffffff; }
switch (k / 4)
switch (kd)
{
case 0: w[ 0] = (w[ 0] & mask0) | tmp0;
w[ 1] = (w[ 1] & mask1) | tmp1;

@ -53,54 +53,22 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
switch (div)
{
case 0: w0[0] |= tmp0;
w0[1] = tmp1;
break;
case 1: w0[1] |= tmp0;
w0[2] = tmp1;
break;
case 2: w0[2] |= tmp0;
w0[3] = tmp1;
break;
case 3: w0[3] |= tmp0;
w1[0] = tmp1;
break;
case 4: w1[0] |= tmp0;
w1[1] = tmp1;
break;
case 5: w1[1] |= tmp0;
w1[2] = tmp1;
break;
case 6: w1[2] |= tmp0;
w1[3] = tmp1;
break;
case 7: w1[3] |= tmp0;
w2[0] = tmp1;
break;
case 8: w2[0] |= tmp0;
w2[1] = tmp1;
break;
case 9: w2[1] |= tmp0;
w2[2] = tmp1;
break;
case 10: w2[2] |= tmp0;
w2[3] = tmp1;
break;
case 11: w2[3] |= tmp0;
w3[0] = tmp1;
break;
case 12: w3[0] |= tmp0;
w3[1] = tmp1;
break;
case 13: w3[1] |= tmp0;
w3[2] = tmp1;
break;
case 14: w3[2] |= tmp0;
w3[3] = tmp1;
break;
case 15: w3[3] |= tmp0;
carry = tmp1;
break;
case 0: w0[0] |= tmp0; w0[1] = tmp1; break;
case 1: w0[1] |= tmp0; w0[2] = tmp1; break;
case 2: w0[2] |= tmp0; w0[3] = tmp1; break;
case 3: w0[3] |= tmp0; w1[0] = tmp1; break;
case 4: w1[0] |= tmp0; w1[1] = tmp1; break;
case 5: w1[1] |= tmp0; w1[2] = tmp1; break;
case 6: w1[2] |= tmp0; w1[3] = tmp1; break;
case 7: w1[3] |= tmp0; w2[0] = tmp1; break;
case 8: w2[0] |= tmp0; w2[1] = tmp1; break;
case 9: w2[1] |= tmp0; w2[2] = tmp1; break;
case 10: w2[2] |= tmp0; w2[3] = tmp1; break;
case 11: w2[3] |= tmp0; w3[0] = tmp1; break;
case 12: w3[0] |= tmp0; w3[1] = tmp1; break;
case 13: w3[1] |= tmp0; w3[2] = tmp1; break;
case 14: w3[2] |= tmp0; w3[3] = tmp1; break;
default: w3[3] |= tmp0; carry = tmp1; break; // this is a bit weird but helps to workaround AMD JiT compiler segfault if set to case 15:
}
const u32 new_len = func_len + 3;

@ -589,23 +589,15 @@ KERNEL_FQ void m23700_loop (KERN_ATTR_TMPS_ESALT (rar3_tmp_t, rar3_t))
u32 tmp0 = 0;
u32 tmp1 = 0;
switch (k & 3)
{
case 0: tmp0 = iter_s >> 0; mask0 = 0x0000ffff;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 1: tmp0 = iter_s >> 8; mask0 = 0xff0000ff;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 2: tmp0 = iter_s >> 16; mask0 = 0xffff0000;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 3: tmp0 = iter_s >> 24; mask0 = 0xffffff00;
tmp1 = iter_s << 8; mask1 = 0x00ffffff;
break;
}
const int kd = k / 4;
const int km = k & 3;
if (km == 0) { tmp0 = iter_s >> 0; tmp1 = 0; mask0 = 0x0000ffff; mask1 = 0xffffffff; }
else if (km == 1) { tmp0 = iter_s >> 8; tmp1 = 0; mask0 = 0xff0000ff; mask1 = 0xffffffff; }
else if (km == 2) { tmp0 = iter_s >> 16; tmp1 = 0; mask0 = 0xffff0000; mask1 = 0xffffffff; }
else if (km == 3) { tmp0 = iter_s >> 24; tmp1 = iter_s << 8; mask0 = 0xffffff00; mask1 = 0x00ffffff; }
switch (k / 4)
switch (kd)
{
case 0: w[ 0] = (w[ 0] & mask0) | tmp0;
w[ 1] = (w[ 1] & mask1) | tmp1;

@ -161,54 +161,22 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
switch (div)
{
case 0: w0[0] |= tmp0;
w0[1] = tmp1;
break;
case 1: w0[1] |= tmp0;
w0[2] = tmp1;
break;
case 2: w0[2] |= tmp0;
w0[3] = tmp1;
break;
case 3: w0[3] |= tmp0;
w1[0] = tmp1;
break;
case 4: w1[0] |= tmp0;
w1[1] = tmp1;
break;
case 5: w1[1] |= tmp0;
w1[2] = tmp1;
break;
case 6: w1[2] |= tmp0;
w1[3] = tmp1;
break;
case 7: w1[3] |= tmp0;
w2[0] = tmp1;
break;
case 8: w2[0] |= tmp0;
w2[1] = tmp1;
break;
case 9: w2[1] |= tmp0;
w2[2] = tmp1;
break;
case 10: w2[2] |= tmp0;
w2[3] = tmp1;
break;
case 11: w2[3] |= tmp0;
w3[0] = tmp1;
break;
case 12: w3[0] |= tmp0;
w3[1] = tmp1;
break;
case 13: w3[1] |= tmp0;
w3[2] = tmp1;
break;
case 14: w3[2] |= tmp0;
w3[3] = tmp1;
break;
case 15: w3[3] |= tmp0;
carry = tmp1;
break;
case 0: w0[0] |= tmp0; w0[1] = tmp1; break;
case 1: w0[1] |= tmp0; w0[2] = tmp1; break;
case 2: w0[2] |= tmp0; w0[3] = tmp1; break;
case 3: w0[3] |= tmp0; w1[0] = tmp1; break;
case 4: w1[0] |= tmp0; w1[1] = tmp1; break;
case 5: w1[1] |= tmp0; w1[2] = tmp1; break;
case 6: w1[2] |= tmp0; w1[3] = tmp1; break;
case 7: w1[3] |= tmp0; w2[0] = tmp1; break;
case 8: w2[0] |= tmp0; w2[1] = tmp1; break;
case 9: w2[1] |= tmp0; w2[2] = tmp1; break;
case 10: w2[2] |= tmp0; w2[3] = tmp1; break;
case 11: w2[3] |= tmp0; w3[0] = tmp1; break;
case 12: w3[0] |= tmp0; w3[1] = tmp1; break;
case 13: w3[1] |= tmp0; w3[2] = tmp1; break;
case 14: w3[2] |= tmp0; w3[3] = tmp1; break;
default: w3[3] |= tmp0; carry = tmp1; break; // this is a bit weird but helps to workaround AMD JiT compiler segfault if set to case 15:
}
const u32 new_len = func_len + 3;

@ -169,23 +169,15 @@ KERNEL_FQ void m23800_loop (KERN_ATTR_TMPS_HOOKS_ESALT (rar3_tmp_t, rar3_hook_t,
u32 tmp0 = 0;
u32 tmp1 = 0;
switch (k & 3)
{
case 0: tmp0 = iter_s >> 0; mask0 = 0x0000ffff;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 1: tmp0 = iter_s >> 8; mask0 = 0xff0000ff;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 2: tmp0 = iter_s >> 16; mask0 = 0xffff0000;
tmp1 = 0; mask1 = 0xffffffff;
break;
case 3: tmp0 = iter_s >> 24; mask0 = 0xffffff00;
tmp1 = iter_s << 8; mask1 = 0x00ffffff;
break;
}
const int kd = k / 4;
const int km = k & 3;
if (km == 0) { tmp0 = iter_s >> 0; tmp1 = 0; mask0 = 0x0000ffff; mask1 = 0xffffffff; }
else if (km == 1) { tmp0 = iter_s >> 8; tmp1 = 0; mask0 = 0xff0000ff; mask1 = 0xffffffff; }
else if (km == 2) { tmp0 = iter_s >> 16; tmp1 = 0; mask0 = 0xffff0000; mask1 = 0xffffffff; }
else if (km == 3) { tmp0 = iter_s >> 24; tmp1 = iter_s << 8; mask0 = 0xffffff00; mask1 = 0x00ffffff; }
switch (k / 4)
switch (kd)
{
case 0: w[ 0] = (w[ 0] & mask0) | tmp0;
w[ 1] = (w[ 1] & mask1) | tmp1;

@ -72,54 +72,22 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
switch (div)
{
case 0: w0[0] |= tmp0;
w0[1] = tmp1;
break;
case 1: w0[1] |= tmp0;
w0[2] = tmp1;
break;
case 2: w0[2] |= tmp0;
w0[3] = tmp1;
break;
case 3: w0[3] |= tmp0;
w1[0] = tmp1;
break;
case 4: w1[0] |= tmp0;
w1[1] = tmp1;
break;
case 5: w1[1] |= tmp0;
w1[2] = tmp1;
break;
case 6: w1[2] |= tmp0;
w1[3] = tmp1;
break;
case 7: w1[3] |= tmp0;
w2[0] = tmp1;
break;
case 8: w2[0] |= tmp0;
w2[1] = tmp1;
break;
case 9: w2[1] |= tmp0;
w2[2] = tmp1;
break;
case 10: w2[2] |= tmp0;
w2[3] = tmp1;
break;
case 11: w2[3] |= tmp0;
w3[0] = tmp1;
break;
case 12: w3[0] |= tmp0;
w3[1] = tmp1;
break;
case 13: w3[1] |= tmp0;
w3[2] = tmp1;
break;
case 14: w3[2] |= tmp0;
w3[3] = tmp1;
break;
case 15: w3[3] |= tmp0;
carry = tmp1;
break;
case 0: w0[0] |= tmp0; w0[1] = tmp1; break;
case 1: w0[1] |= tmp0; w0[2] = tmp1; break;
case 2: w0[2] |= tmp0; w0[3] = tmp1; break;
case 3: w0[3] |= tmp0; w1[0] = tmp1; break;
case 4: w1[0] |= tmp0; w1[1] = tmp1; break;
case 5: w1[1] |= tmp0; w1[2] = tmp1; break;
case 6: w1[2] |= tmp0; w1[3] = tmp1; break;
case 7: w1[3] |= tmp0; w2[0] = tmp1; break;
case 8: w2[0] |= tmp0; w2[1] = tmp1; break;
case 9: w2[1] |= tmp0; w2[2] = tmp1; break;
case 10: w2[2] |= tmp0; w2[3] = tmp1; break;
case 11: w2[3] |= tmp0; w3[0] = tmp1; break;
case 12: w3[0] |= tmp0; w3[1] = tmp1; break;
case 13: w3[1] |= tmp0; w3[2] = tmp1; break;
case 14: w3[2] |= tmp0; w3[3] = tmp1; break;
default: w3[3] |= tmp0; carry = tmp1; break; // this is a bit weird but helps to workaround AMD JiT compiler segfault if set to case 15:
}
const u32 new_len = func_len + 3;

@ -78,6 +78,7 @@
- CUDA Backend: Give detailed warning if either the NVIDIA CUDA or the NVIDIA RTC library cannot be initialized
- CUDA Backend: Use blocking events to avoid 100% CPU core usage (per GPU)
- OpenCL Runtime: Workaround JiT compiler deadlock on NVIDIA driver >= 465.89
- OpenCL Runtime: Workaround JiT compiler segfault on legacy AMDGPU driver compiling RAR3 OpenCL kernel
- RAR3 Kernels: Improved loop code, improving performance by 23%
- Scrypt Kernels: Added a number of GPU specific optimizations per hash modes to hashcat.hctune
- Scrypt Kernels: Added detailed documentation on device specific tunings in hashcat.hctune

@ -59,19 +59,6 @@ typedef struct rar3_tmp_optimized
static const int ROUNDS_RAR3 = 262144;
static const char *SIGNATURE_RAR3 = "$RAR3$";
// Decides from the device parameters whether the unstable flag is raised for this module.
// Returns true when the device's OpenCL vendor id equals VENDOR_ID_AMD and has_vperm is false;
// returns false for every other device. hashconfig, user_options and user_options_extra are not read.
bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
{
// Driver version and failing test invocations (return code 255) recorded for -m 12500:
// amdgpu-pro-20.50-1234664-ubuntu-20.04 (legacy)
// test_1619943729/test_report.log:! unhandled return code 255, cmdline : cat test_1619943729/12500_passwords.txt | ./hashcat --quiet --potfile-disable --runtime 400 --hwmon-disable -O -D 2 --backend-vector-width 1 -a 0 -m 12500 test_1619943729/12500_hashes.txt
// test_1619955152/test_report.log:! unhandled return code 255, cmdline : cat test_1619955152/12500_passwords.txt | ./hashcat --quiet --potfile-disable --runtime 400 --hwmon-disable -D 2 --backend-vector-width 4 -a 0 -m 12500 test_1619955152/12500_hashes.txt
// NOTE(review): has_vperm == false presumably identifies the legacy AMD driver stack named above -- confirm.
if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
{
return true;
}
return false;
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL);
@ -289,6 +276,6 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_st_hash = module_st_hash;
module_ctx->module_st_pass = module_st_pass;
module_ctx->module_tmp_size = module_tmp_size;
module_ctx->module_unstable_warning = module_unstable_warning;
module_ctx->module_unstable_warning = MODULE_DEFAULT;
module_ctx->module_warmup_disable = MODULE_DEFAULT;
}

@ -68,19 +68,6 @@ typedef struct rar3_tmp_optimized
static const int ROUNDS_RAR3 = 262144;
static const char *SIGNATURE_RAR3 = "$RAR3$";
// Decides from the device parameters whether the unstable flag is raised for this module.
// Returns true when the device's OpenCL vendor id equals VENDOR_ID_AMD and has_vperm is false;
// returns false for every other device. hashconfig, user_options and user_options_extra are not read.
bool module_unstable_warning (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hc_device_param_t *device_param)
{
// Driver version and failing test invocations (return code 139) recorded for -m 23700:
// amdgpu-pro-20.50-1234664-ubuntu-20.04 (legacy)
// test_1619943729/test_report.log:! unhandled return code 139, cmdline : cat test_1619943729/23700_passwords.txt | ./hashcat --quiet --potfile-disable --runtime 400 --hwmon-disable -O -D 2 --backend-vector-width 1 -a 0 -m 23700 test_1619943729/23700_hashes.txt
// test_1619955152/test_report.log:! unhandled return code 139, cmdline : cat test_1619955152/23700_passwords.txt | ./hashcat --quiet --potfile-disable --runtime 400 --hwmon-disable -D 2 --backend-vector-width 4 -a 0 -m 23700 test_1619955152/23700_hashes.txt
// NOTE(review): has_vperm == false presumably identifies the legacy AMD driver stack named above -- confirm.
if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == false))
{
return true;
}
return false;
}
u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const bool optimized_kernel = user_options->optimized_kernel_enable;
@ -405,6 +392,6 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_st_hash = module_st_hash;
module_ctx->module_st_pass = module_st_pass;
module_ctx->module_tmp_size = module_tmp_size;
module_ctx->module_unstable_warning = module_unstable_warning;
module_ctx->module_unstable_warning = MODULE_DEFAULT;
module_ctx->module_warmup_disable = MODULE_DEFAULT;
}

Loading…
Cancel
Save