From 8c5c225d8f009148decdf7097365e44d9928fe0a Mon Sep 17 00:00:00 2001 From: unix-ninja Date: Mon, 22 Oct 2018 13:27:35 -0400 Subject: [PATCH] Optimize performance on NVIDIA GTX --- OpenCL/m18100_a0-pure.cl | 70 +++++++++++++++++----------------- OpenCL/m18100_a1-pure.cl | 70 +++++++++++++++++----------------- OpenCL/m18100_a3-pure.cl | 82 +++++++++++++++++++--------------------- 3 files changed, 110 insertions(+), 112 deletions(-) diff --git a/OpenCL/m18100_a0-pure.cl b/OpenCL/m18100_a0-pure.cl index bb0894e19..9c589b1b3 100644 --- a/OpenCL/m18100_a0-pure.cl +++ b/OpenCL/m18100_a0-pure.cl @@ -59,29 +59,30 @@ __kernel void m18100_mxx (__global pw_t *pws, __constant const kernel_rule_t *ru sha1_hmac_final (&ctx); - // calculate the offset using the least 4 bits of the last byte of our hash - const u32x otp_offset = ctx.opad.h[4] & 0xf; - // initialize a buffer for the otp code u32 otp_code = 0; // grab 4 consecutive bytes of the hash, starting at offset - // on some systems, &3 is faster than %4, so we will use it in our switch() - switch (otp_offset & 3) + switch (ctx.opad.h[4] & 15) { - case 1: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) << 8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24); - break; - case 2: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16); - break; - case 3: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >> 8); - break; - default: - otp_code = ctx.opad.h[otp_offset/4]; - break; + case 0: otp_code = ctx.opad.h[0]; break; + case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break; + case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break; + case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break; + case 4: otp_code = ctx.opad.h[1]; break; + case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break; + case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break; + case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break; + case 8: otp_code = ctx.opad.h[2]; break; + case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break; + case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break; + case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break; + case 12: otp_code = ctx.opad.h[3]; break; + case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break; + case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break; + case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break; } + // take only the lower 31 bits otp_code &= 0x7fffffff; // we want to generate only 6 digits of code @@ -152,29 +153,30 @@ __kernel void m18100_sxx (__global pw_t *pws, __constant const kernel_rule_t *ru sha1_hmac_final (&ctx); - // calculate the offset using the least 4 bits of the last byte of our hash - const u32x otp_offset = ctx.opad.h[4] & 0xf; - // initialize a buffer for the otp code u32 otp_code = 0; // grab 4 consecutive bytes of the hash, starting at offset - // on some systems, &3 is faster than %4, so we will use it in our switch() - switch (otp_offset & 3) + switch (ctx.opad.h[4] & 15) { - case 1: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) << 8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24); - break; - case 2: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16); - break; - case 3: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >> 8); - break; - default: - otp_code = ctx.opad.h[otp_offset/4]; - break; + case 0: otp_code = ctx.opad.h[0]; break; + case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break; + case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break; + case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break; + case 4: otp_code = ctx.opad.h[1]; break; + case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break; + case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break; + case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break; + case 8: otp_code = ctx.opad.h[2]; break; + case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break; + case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break; + case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break; + case 12: otp_code = ctx.opad.h[3]; break; + case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break; + case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break; + case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break; } + // take only the lower 31 bits otp_code &= 0x7fffffff; // we want to generate only 6 digits of code diff --git a/OpenCL/m18100_a1-pure.cl b/OpenCL/m18100_a1-pure.cl index 815eec778..940a7c5da 100644 --- a/OpenCL/m18100_a1-pure.cl +++ b/OpenCL/m18100_a1-pure.cl @@ -82,29 +82,30 @@ __kernel void m18100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_final (&ctx); - // calculate the offset using the least 4 bits of the last byte of our hash - const u32x otp_offset = ctx.opad.h[4] & 0xf; - // initialize a buffer for the otp code u32 otp_code = 0; // grab 4 consecutive bytes of the hash, starting at offset - // on some systems, &3 is faster than %4, so we will use it in our switch() - switch (otp_offset & 3) + switch (ctx.opad.h[4] & 15) { - case 1: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) << 8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24); - break; - case 2: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16); - break; - case 3: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >> 8); - break; - default: - otp_code = ctx.opad.h[otp_offset/4]; - break; + case 0: otp_code = ctx.opad.h[0]; break; + case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break; + case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break; + case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break; + case 4: otp_code = ctx.opad.h[1]; break; + case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break; + case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break; + case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break; + case 8: otp_code = ctx.opad.h[2]; break; + case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break; + case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break; + case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break; + case 12: otp_code = ctx.opad.h[3]; break; + case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break; + case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break; + case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break; } + // take only the lower 31 bits otp_code &= 0x7fffffff; // we want to generate only 6 digits of code @@ -200,29 +201,30 @@ __kernel void m18100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_final (&ctx); - // calculate the offset using the least 4 bits of the last byte of our hash - const u32x otp_offset = ctx.opad.h[4] & 0xf; - // initialize a buffer for the otp code u32 otp_code = 0; // grab 4 consecutive bytes of the hash, starting at offset - // on some systems, &3 is faster than %4, so we will use it in our switch() - switch (otp_offset & 3) + switch (ctx.opad.h[4] & 15) { - case 1: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) << 8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24); - break; - case 2: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16); - break; - case 3: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >> 8); - break; - default: - otp_code = ctx.opad.h[otp_offset/4]; - break; + case 0: otp_code = ctx.opad.h[0]; break; + case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break; + case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break; + case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break; + case 4: otp_code = ctx.opad.h[1]; break; + case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break; + case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break; + case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break; + case 8: otp_code = ctx.opad.h[2]; break; + case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break; + case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break; + case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break; + case 12: otp_code = ctx.opad.h[3]; break; + case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break; + case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break; + case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break; } + // take only the lower 31 bits otp_code &= 0x7fffffff; // we want to generate only 6 digits of code diff --git a/OpenCL/m18100_a3-pure.cl b/OpenCL/m18100_a3-pure.cl index a71516aa8..d622be763 100644 --- a/OpenCL/m18100_a3-pure.cl +++ b/OpenCL/m18100_a3-pure.cl @@ -68,39 +68,36 @@ __kernel void m18100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_final_vector (&ctx); - // calculate the offset using the least 4 bits of the last byte of our hash - const u32x otp_offset = ctx.opad.h[4] & 0xf; - // initialize a buffer for the otp code u32 otp_code = 0; // grab 4 consecutive bytes of the hash, starting at offset - // on some systems, &3 is faster than %4, so we will use it in our switch() - switch (otp_offset & 3) + switch (ctx.opad.h[4] & 15) { - case 1: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) << 8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24); - break; - case 2: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16); - break; - case 3: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >> 8); - break; - default: - otp_code = ctx.opad.h[otp_offset/4]; - break; + case 0: otp_code = ctx.opad.h[0]; break; + case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break; + case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break; + case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break; + case 4: otp_code = ctx.opad.h[1]; break; + case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break; + case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break; + case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break; + case 8: otp_code = ctx.opad.h[2]; break; + case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break; + case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break; + case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break; + case 12: otp_code = ctx.opad.h[3]; break; + case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break; + case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break; + case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break; } + // take only the lower 31 bits otp_code &= 0x7fffffff; + // we want to generate only 6 digits of code otp_code %= 1000000; - const u32x r0 = ctx.opad.h[DGST_R0]; - const u32x r1 = ctx.opad.h[DGST_R1]; - const u32x r2 = ctx.opad.h[DGST_R2]; - const u32x r3 = ctx.opad.h[DGST_R3]; - COMPARE_M_SIMD (otp_code, 0, 0, 0); } } @@ -172,39 +169,36 @@ __kernel void m18100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule sha1_hmac_final_vector (&ctx); - // calculate the offset using the least 4 bits of the last byte of our hash - const u32x otp_offset = ctx.opad.h[4] & 0xf; - // initialize a buffer for the otp code u32 otp_code = 0; // grab 4 consecutive bytes of the hash, starting at offset - // on some systems, &3 is faster than %4, so we will use it in our switch() - switch (otp_offset & 3) + switch (ctx.opad.h[4] & 15) { - case 1: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x00ffffff) << 8) | ((ctx.opad.h[otp_offset/4+1] & 0xff000000) >> 24); - break; - case 2: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x0000ffff) << 16) | ((ctx.opad.h[otp_offset/4+1] & 0xffff0000) >> 16); - break; - case 3: - otp_code = ((ctx.opad.h[otp_offset/4] & 0x000000ff) << 24) | ((ctx.opad.h[otp_offset/4+1] & 0xffffff00) >> 8); - break; - default: - otp_code = ctx.opad.h[otp_offset/4]; - break; + case 0: otp_code = ctx.opad.h[0]; break; + case 1: otp_code = ctx.opad.h[0] << 8 | ctx.opad.h[1] >> 24; break; + case 2: otp_code = ctx.opad.h[0] << 16 | ctx.opad.h[1] >> 16; break; + case 3: otp_code = ctx.opad.h[0] << 24 | ctx.opad.h[1] >> 8; break; + case 4: otp_code = ctx.opad.h[1]; break; + case 5: otp_code = ctx.opad.h[1] << 8 | ctx.opad.h[2] >> 24; break; + case 6: otp_code = ctx.opad.h[1] << 16 | ctx.opad.h[2] >> 16; break; + case 7: otp_code = ctx.opad.h[1] << 24 | ctx.opad.h[2] >> 8; break; + case 8: otp_code = ctx.opad.h[2]; break; + case 9: otp_code = ctx.opad.h[2] << 8 | ctx.opad.h[3] >> 24; break; + case 10: otp_code = ctx.opad.h[2] << 16 | ctx.opad.h[3] >> 16; break; + case 11: otp_code = ctx.opad.h[2] << 24 | ctx.opad.h[3] >> 8; break; + case 12: otp_code = ctx.opad.h[3]; break; + case 13: otp_code = ctx.opad.h[3] << 8 | ctx.opad.h[4] >> 24; break; + case 14: otp_code = ctx.opad.h[3] << 16 | ctx.opad.h[4] >> 16; break; + case 15: otp_code = ctx.opad.h[3] << 24 | ctx.opad.h[4] >> 8; break; } + // take only the lower 31 bits otp_code &= 0x7fffffff; + // we want to generate only 6 digits of code otp_code %= 1000000; - const u32x r0 = ctx.opad.h[DGST_R0]; - const u32x r1 = ctx.opad.h[DGST_R1]; - const u32x r2 = ctx.opad.h[DGST_R2]; - const u32x r3 = ctx.opad.h[DGST_R3]; - COMPARE_S_SIMD (otp_code, 0, 0, 0); } }