From 3a31b669b52d8bba5d61573c24f3d1593c160af8 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 24 Jul 2021 11:58:36 +0200 Subject: [PATCH] Optimize and clean up -m 25200 code. Speed on CPU: 1521 -> 2066 (35% faster) Speed on GPU: 16610 -> 157754 (9.4 times faster) Works for all password lengths --- OpenCL/m25200-pure.cl | 310 +++++++++++++++++++++++++++---------- src/modules/module_25200.c | 93 ++++++----- 2 files changed, 288 insertions(+), 115 deletions(-) diff --git a/OpenCL/m25200-pure.cl b/OpenCL/m25200-pure.cl index 0a2f4f521..f72fce044 100644 --- a/OpenCL/m25200-pure.cl +++ b/OpenCL/m25200-pure.cl @@ -17,25 +17,35 @@ #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" -#define SNMPV3_ENGINEID_MAX 32 -#define SNMPV3_SALT_MAX 752 +#define SNMPV3_SALT_MAX 1500 +#define SNMPV3_ENGINEID_MAX 32 +#define SNMPV3_MSG_AUTH_PARAMS_MAX 12 +#define SNMPV3_ROUNDS 1048576 +#define SNMPV3_MAX_PW_LENGTH 64 + +#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32) +#define SNMPV3_HASH_ELEMS 8 // 8 = aligned 5 typedef struct hmac_sha1_tmp { - u32 idx; - sha1_ctx_t ctx; + u32 tmp[SNMPV3_TMP_ELEMS]; + u32 h[SNMPV3_HASH_ELEMS]; } hmac_sha1_tmp_t; +#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64 +#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64 +#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9 + typedef struct snmpv3 { - u32 salt_buf[SNMPV3_SALT_MAX]; + u32 salt_buf[SNMPV3_MAX_SALT_ELEMS]; u32 salt_len; - u8 engineID_buf[SNMPV3_ENGINEID_MAX]; + u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS]; u32 engineID_len; - u8 packet_number[8+1]; + u32 packet_number[SNMPV3_MAX_PNUM_ELEMS]; } snmpv3_t; @@ -53,35 +63,68 @@ KERNEL_FQ void m25200_init (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t)) * base */ - const GLOBAL_AS u8 *pw_buf = (GLOBAL_AS u8 *) pws[gid].i; - const u32 pw_len = pws[gid].pw_len; - /** - * authkey - */ + u32 w[64] = { 0 }; - u32 idx = 0; + for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) + { + w[idx] = pws[gid].i[idx]; + } - u32 buf[16] = { 0 }; + u8 *src_ptr = (u8 *) w; - u8 *tmp_buf = (u8 *) buf; + // password 64 times, also swapped - sha1_ctx_t ctx; + u32 dst_buf[16]; - sha1_init (&ctx); + u8 *dst_ptr = (u8 *) dst_buf; + + int tmp_idx = 0; for (int i = 0; i < 64; i++) { - tmp_buf[i] = pw_buf[idx++]; + for (int j = 0; j < pw_len; j++) + { + const int dst_idx = tmp_idx & 63; - if (idx >= pw_len) idx = 0; + dst_ptr[dst_idx] = src_ptr[j]; + + // write to global memory every time 64 byte are written into cache + + if (dst_idx == 63) + { + const int tmp_idx4 = (tmp_idx - 63) / 4; + + tmps[gid].tmp[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]); + tmps[gid].tmp[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]); + tmps[gid].tmp[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]); + tmps[gid].tmp[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]); + tmps[gid].tmp[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]); + tmps[gid].tmp[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]); + tmps[gid].tmp[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]); + tmps[gid].tmp[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]); + tmps[gid].tmp[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]); + tmps[gid].tmp[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]); + tmps[gid].tmp[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]); + tmps[gid].tmp[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]); + tmps[gid].tmp[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]); + tmps[gid].tmp[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]); + tmps[gid].tmp[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]); + tmps[gid].tmp[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]); + } + + tmp_idx++; + } } - sha1_update_swap (&ctx, buf, 64); + // hash - tmps[gid].idx = idx; - tmps[gid].ctx = ctx; + tmps[gid].h[0] = SHA1M_A; + tmps[gid].h[1] = SHA1M_B; + tmps[gid].h[2] = SHA1M_C; + tmps[gid].h[3] = SHA1M_D; + tmps[gid].h[4] = SHA1M_E; } KERNEL_FQ void m25200_loop (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t)) @@ -94,32 +137,96 @@ KERNEL_FQ void m25200_loop (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t)) if (gid >= gid_max) return; - const GLOBAL_AS u8 *pw_buf = (GLOBAL_AS u8 *) pws[gid].i; + u32 h[5]; + + h[0] = tmps[gid].h[0]; + h[1] = tmps[gid].h[1]; + h[2] = tmps[gid].h[2]; + h[3] = tmps[gid].h[3]; + h[4] = tmps[gid].h[4]; const u32 pw_len = pws[gid].pw_len; - u32 idx = tmps[gid].idx; + const int pw_len64 = pw_len * 64; - u32 buf[16] = { 0 }; + #define SNMPV3_TMP_ELEMS_OPT 1024 // 1024 = (64 max pw length * 64) / sizeof (u32) + // for pw length > 64 we use global memory reads - u8 *tmp_buf = (u8 *) buf; + u32 tmp[SNMPV3_TMP_ELEMS_OPT]; - sha1_ctx_t ctx = tmps[gid].ctx; - - for (u32 j = 0; j < loop_cnt; j++) + if (pw_len < 64) { - for (int i = 0; i < 64; i++) + for (int i = 0; i < pw_len64 / 4; i++) { - tmp_buf[i] = pw_buf[idx++]; - - if (idx >= pw_len) idx = 0; + tmp[i] = tmps[gid].tmp[i]; } - sha1_update_swap (&ctx, buf, 64); + for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64) + { + const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = tmp[idx + 0]; + w0[1] = tmp[idx + 1]; + w0[2] = tmp[idx + 2]; + w0[3] = tmp[idx + 3]; + w1[0] = tmp[idx + 4]; + w1[1] = tmp[idx + 5]; + w1[2] = tmp[idx + 6]; + w1[3] = tmp[idx + 7]; + w2[0] = tmp[idx + 8]; + w2[1] = tmp[idx + 9]; + w2[2] = tmp[idx + 10]; + w2[3] = tmp[idx + 11]; + w3[0] = tmp[idx + 12]; + w3[1] = tmp[idx + 13]; + w3[2] = tmp[idx + 14]; + w3[3] = tmp[idx + 15]; + + sha1_transform (w0, w1, w2, w3, h); + } + } + else + { + for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64) + { + const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this + + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; + + w0[0] = tmps[gid].tmp[idx + 0]; + w0[1] = tmps[gid].tmp[idx + 1]; + w0[2] = tmps[gid].tmp[idx + 2]; + w0[3] = tmps[gid].tmp[idx + 3]; + w1[0] = tmps[gid].tmp[idx + 4]; + w1[1] = tmps[gid].tmp[idx + 5]; + w1[2] = tmps[gid].tmp[idx + 6]; + w1[3] = tmps[gid].tmp[idx + 7]; + w2[0] = tmps[gid].tmp[idx + 8]; + w2[1] = tmps[gid].tmp[idx + 9]; + w2[2] = tmps[gid].tmp[idx + 10]; + w2[3] = tmps[gid].tmp[idx + 11]; + w3[0] = tmps[gid].tmp[idx + 12]; + w3[1] = tmps[gid].tmp[idx + 13]; + w3[2] = tmps[gid].tmp[idx + 14]; + w3[3] = tmps[gid].tmp[idx + 15]; + + sha1_transform (w0, w1, w2, w3, h); + } } - tmps[gid].idx = idx; - tmps[gid].ctx = ctx; + tmps[gid].h[0] = h[0]; + tmps[gid].h[1] = h[1]; + tmps[gid].h[2] = h[2]; + tmps[gid].h[3] = h[3]; + tmps[gid].h[4] = h[4]; } KERNEL_FQ void m25200_comp (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t)) @@ -132,64 +239,108 @@ KERNEL_FQ void m25200_comp (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t)) if (gid >= gid_max) return; - const GLOBAL_AS u8 *engineID_buf = esalt_bufs[DIGESTS_OFFSET].engineID_buf; + u32 w0[4]; + u32 w1[4]; + u32 w2[4]; + u32 w3[4]; - u32 engineID_len = esalt_bufs[DIGESTS_OFFSET].engineID_len; + w0[0] = 0x80000000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + w2[0] = 0; + w2[1] = 0; + w2[2] = 0; + w2[3] = 0; + w3[0] = 0; + w3[1] = 0; + w3[2] = 0; + w3[3] = 1048576 * 8; - sha1_ctx_t ctx = tmps[gid].ctx; + u32 h[5]; - sha1_final (&ctx); + h[0] = tmps[gid].h[0]; + h[1] = tmps[gid].h[1]; + h[2] = tmps[gid].h[2]; + h[3] = tmps[gid].h[3]; + h[4] = tmps[gid].h[4]; - const u32 h[5] = { - hc_swap32_S (ctx.h[0]), - hc_swap32_S (ctx.h[1]), - hc_swap32_S (ctx.h[2]), - hc_swap32_S (ctx.h[3]), - hc_swap32_S (ctx.h[4]) - }; + sha1_transform (w0, w1, w2, w3, h); - u32 tmp_buf[32] = { 0 }; - - tmp_buf[0] = h[0]; - tmp_buf[1] = h[1]; - tmp_buf[2] = h[2]; - tmp_buf[3] = h[3]; - tmp_buf[4] = h[4]; - - u8 *buf = (u8 *) (tmp_buf); - - u32 i = 20; - u32 j; - - for (j = 0; j < engineID_len; j++) - { - buf[i++] = engineID_buf[j]; - } - - for (j = 0; j < 20; j++) - { - buf[i++] = buf[j]; - } + sha1_ctx_t ctx; sha1_init (&ctx); - sha1_update_swap (&ctx, tmp_buf, i); + u32 w[16]; + + w[ 0] = h[0]; + w[ 1] = h[1]; + w[ 2] = h[2]; + w[ 3] = h[3]; + w[ 4] = h[4]; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + sha1_update (&ctx, w, 20); + + sha1_update_global_swap (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len); + + w[ 0] = h[0]; + w[ 1] = h[1]; + w[ 2] = h[2]; + w[ 3] = h[3]; + w[ 4] = h[4]; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; + + sha1_update (&ctx, w, 20); sha1_final (&ctx); - u32 key[16] = { 0 }; - - key[0] = ctx.h[0]; - key[1] = ctx.h[1]; - key[2] = ctx.h[2]; - key[3] = ctx.h[3]; - key[4] = ctx.h[4]; + w[ 0] = ctx.h[0]; + w[ 1] = ctx.h[1]; + w[ 2] = ctx.h[2]; + w[ 3] = ctx.h[3]; + w[ 4] = ctx.h[4]; + w[ 5] = 0; + w[ 6] = 0; + w[ 7] = 0; + w[ 8] = 0; + w[ 9] = 0; + w[10] = 0; + w[11] = 0; + w[12] = 0; + w[13] = 0; + w[14] = 0; + w[15] = 0; sha1_hmac_ctx_t hmac_ctx; - sha1_hmac_init (&hmac_ctx, key, 20); + sha1_hmac_init (&hmac_ctx, w, 20); - sha1_hmac_update_global (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len); + sha1_hmac_update_global_swap (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len); sha1_hmac_final (&hmac_ctx); @@ -204,3 +355,4 @@ KERNEL_FQ void m25200_comp (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t)) #include COMPARE_M #endif } + diff --git a/src/modules/module_25200.c b/src/modules/module_25200.c index 89617be24..c98a347ef 100644 --- a/src/modules/module_25200.c +++ b/src/modules/module_25200.c @@ -44,27 +44,35 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig, static const char *SIGNATURE_SNMPV3 = "$SNMPv3$2$"; -#define SNMPV3_SALT_MAX 1500 -#define SNMPV3_SALT_MAX_BIN 752 -#define SNMPV3_ENGINEID_MAX 32 -#define SNMPV3_MSG_AUTH_PARAMS_MAX 12 +#define SNMPV3_SALT_MAX 1500 +#define SNMPV3_ENGINEID_MAX 32 +#define SNMPV3_MSG_AUTH_PARAMS_MAX 12 +#define SNMPV3_ROUNDS 1048576 +#define SNMPV3_MAX_PW_LENGTH 64 + +#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32) +#define SNMPV3_HASH_ELEMS 8 // 8 = aligned 5 typedef struct hmac_sha1_tmp { - u32 idx; - sha1_ctx_t ctx; + u32 tmp[SNMPV3_TMP_ELEMS]; + u32 h[SNMPV3_HASH_ELEMS]; } hmac_sha1_tmp_t; +#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64 +#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64 +#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9 + typedef struct snmpv3 { - u32 salt_buf[SNMPV3_SALT_MAX_BIN]; + u32 salt_buf[SNMPV3_MAX_SALT_ELEMS]; u32 salt_len; - u8 engineID_buf[SNMPV3_ENGINEID_MAX]; + u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS]; u32 engineID_len; - u8 packet_number[8+1]; + u32 packet_number[SNMPV3_MAX_PNUM_ELEMS]; } snmpv3_t; @@ -82,6 +90,23 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } +u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + // we need to fix iteration count to guarantee the loop count is a multiple of 64 + // 2k calls to sha1_transform typically is enough to overtime pcie bottleneck + + const u32 kernel_loops_min = 2048 * 64; + + return kernel_loops_min; +} + +u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +{ + const u32 kernel_loops_max = 2048 * 64; + + return kernel_loops_max; +} + int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) { u32 *digest = (u32 *) digest_buf; @@ -146,12 +171,7 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE snmpv3->salt_len = hex_decode (salt_pos, salt_len, salt_ptr); - for (uint i = 0; i < snmpv3->salt_len / 2; i++) - { - snmpv3->salt_buf[i] = byte_swap_32 (snmpv3->salt_buf[i]); - } - - salt->salt_iter = 16384 - 1; + salt->salt_iter = SNMPV3_ROUNDS; // handle unique salts detection @@ -163,9 +183,9 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE // store sha1(snmpv3->salt_buf) in salt_buf - salt->salt_len = 20; + memcpy (salt->salt_buf, sha1_ctx.h, 20); - memcpy (salt->salt_buf, sha1_ctx.h, salt->salt_len); + salt->salt_len = 20; // engineid @@ -183,13 +203,12 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE digest[0] = hex_to_u32 (hash_pos + 0); digest[1] = hex_to_u32 (hash_pos + 8); digest[2] = hex_to_u32 (hash_pos + 16); + digest[3] = 0; digest[0] = byte_swap_32 (digest[0]); digest[1] = byte_swap_32 (digest[1]); digest[2] = byte_swap_32 (digest[2]); - digest[3] = 0; - return (PARSER_OK); } @@ -199,33 +218,35 @@ int module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE snmpv3_t *snmpv3 = (snmpv3_t *) esalt_buf; - int line_len = snprintf (line_buf, 10 + strlen ((char *) snmpv3->packet_number) + 1 + 1, "%s%s$", SIGNATURE_SNMPV3, snmpv3->packet_number); + u8 *out_buf = (u8 *) line_buf; - uint i; + int out_len = snprintf (line_buf, line_size, "%s%s$", SIGNATURE_SNMPV3, (char *) snmpv3->packet_number); - u32 salt_buf_32[SNMPV3_SALT_MAX_BIN] = { 0 }; + out_len += hex_encode ((u8 *) snmpv3->salt_buf, snmpv3->salt_len, out_buf + out_len); - for (i = 0; i < SNMPV3_SALT_MAX_BIN; i++) salt_buf_32[i] = byte_swap_32 (snmpv3->salt_buf[i]); + out_buf[out_len] = '$'; - const u8 *salt_buf_ptr = (u8 *) salt_buf_32; + out_len++; - line_len += hex_encode (salt_buf_ptr, snmpv3->salt_len, (u8 *) line_buf+line_len); + out_len += hex_encode ((u8 *) snmpv3->engineID_buf, snmpv3->engineID_len, out_buf + out_len); - line_buf[line_len] = '$'; + out_buf[out_len] = '$'; - line_len++; + out_len++; - line_len += hex_encode (snmpv3->engineID_buf, snmpv3->engineID_len, (u8 *) line_buf+line_len); + u32 digest_tmp[3]; - line_buf[line_len] = '$'; + digest_tmp[0] = byte_swap_32 (digest[0]); + digest_tmp[1] = byte_swap_32 (digest[1]); + digest_tmp[2] = byte_swap_32 (digest[2]); - line_len++; + u32_to_hex (digest_tmp[0], out_buf + out_len); out_len += 8; + u32_to_hex (digest_tmp[1], out_buf + out_len); out_len += 8; + u32_to_hex (digest_tmp[2], out_buf + out_len); out_len += 8; - u32_to_hex (byte_swap_32 (digest[0]), (u8 *) line_buf+line_len); line_len += 8; - u32_to_hex (byte_swap_32 (digest[1]), (u8 *) line_buf+line_len); line_len += 8; - u32_to_hex (byte_swap_32 (digest[2]), (u8 *) line_buf+line_len); line_len += 8; + out_buf[out_len] = 0; - return line_len; + return out_len; } void module_init (module_ctx_t *module_ctx) @@ -277,8 +298,8 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_jit_cache_disable = MODULE_DEFAULT; module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; - module_ctx->module_kernel_loops_max = MODULE_DEFAULT; - module_ctx->module_kernel_loops_min = MODULE_DEFAULT; + module_ctx->module_kernel_loops_max = module_kernel_loops_max; + module_ctx->module_kernel_loops_min = module_kernel_loops_min; module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type;