1
0
mirror of https://github.com/hashcat/hashcat.git synced 2024-11-22 16:18:09 +00:00

Optimize and clean up -m 25200 code.

Speed on CPU: 1521 -> 2066 (35% faster)
Speed on GPU: 16610 -> 157754 (9.4 times faster)
Works for all password lengths
This commit is contained in:
Jens Steube 2021-07-24 11:58:36 +02:00
parent 5e0f7ecf00
commit 3a31b669b5
2 changed files with 288 additions and 115 deletions

View File

@ -17,25 +17,35 @@
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define SNMPV3_ENGINEID_MAX 32
#define SNMPV3_SALT_MAX 752
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 32
#define SNMPV3_MSG_AUTH_PARAMS_MAX 12
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 64
#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 8 // 8 = aligned 5
typedef struct hmac_sha1_tmp
{
u32 idx;
sha1_ctx_t ctx;
u32 tmp[SNMPV3_TMP_ELEMS];
u32 h[SNMPV3_HASH_ELEMS];
} hmac_sha1_tmp_t;
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64
#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_SALT_MAX];
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u8 engineID_buf[SNMPV3_ENGINEID_MAX];
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u8 packet_number[8+1];
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
@ -53,35 +63,68 @@ KERNEL_FQ void m25200_init (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
* base
*/
const GLOBAL_AS u8 *pw_buf = (GLOBAL_AS u8 *) pws[gid].i;
const u32 pw_len = pws[gid].pw_len;
/**
* authkey
*/
u32 w[64] = { 0 };
u32 idx = 0;
for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
{
w[idx] = pws[gid].i[idx];
}
u32 buf[16] = { 0 };
u8 *src_ptr = (u8 *) w;
u8 *tmp_buf = (u8 *) buf;
// password 64 times, also swapped
sha1_ctx_t ctx;
u32 dst_buf[16];
sha1_init (&ctx);
u8 *dst_ptr = (u8 *) dst_buf;
int tmp_idx = 0;
for (int i = 0; i < 64; i++)
{
tmp_buf[i] = pw_buf[idx++];
for (int j = 0; j < pw_len; j++)
{
const int dst_idx = tmp_idx & 63;
if (idx >= pw_len) idx = 0;
dst_ptr[dst_idx] = src_ptr[j];
// write to global memory every time 64 bytes have been written into the cache
if (dst_idx == 63)
{
const int tmp_idx4 = (tmp_idx - 63) / 4;
tmps[gid].tmp[tmp_idx4 + 0] = hc_swap32_S (dst_buf[ 0]);
tmps[gid].tmp[tmp_idx4 + 1] = hc_swap32_S (dst_buf[ 1]);
tmps[gid].tmp[tmp_idx4 + 2] = hc_swap32_S (dst_buf[ 2]);
tmps[gid].tmp[tmp_idx4 + 3] = hc_swap32_S (dst_buf[ 3]);
tmps[gid].tmp[tmp_idx4 + 4] = hc_swap32_S (dst_buf[ 4]);
tmps[gid].tmp[tmp_idx4 + 5] = hc_swap32_S (dst_buf[ 5]);
tmps[gid].tmp[tmp_idx4 + 6] = hc_swap32_S (dst_buf[ 6]);
tmps[gid].tmp[tmp_idx4 + 7] = hc_swap32_S (dst_buf[ 7]);
tmps[gid].tmp[tmp_idx4 + 8] = hc_swap32_S (dst_buf[ 8]);
tmps[gid].tmp[tmp_idx4 + 9] = hc_swap32_S (dst_buf[ 9]);
tmps[gid].tmp[tmp_idx4 + 10] = hc_swap32_S (dst_buf[10]);
tmps[gid].tmp[tmp_idx4 + 11] = hc_swap32_S (dst_buf[11]);
tmps[gid].tmp[tmp_idx4 + 12] = hc_swap32_S (dst_buf[12]);
tmps[gid].tmp[tmp_idx4 + 13] = hc_swap32_S (dst_buf[13]);
tmps[gid].tmp[tmp_idx4 + 14] = hc_swap32_S (dst_buf[14]);
tmps[gid].tmp[tmp_idx4 + 15] = hc_swap32_S (dst_buf[15]);
}
tmp_idx++;
}
}
sha1_update_swap (&ctx, buf, 64);
// hash
tmps[gid].idx = idx;
tmps[gid].ctx = ctx;
tmps[gid].h[0] = SHA1M_A;
tmps[gid].h[1] = SHA1M_B;
tmps[gid].h[2] = SHA1M_C;
tmps[gid].h[3] = SHA1M_D;
tmps[gid].h[4] = SHA1M_E;
}
KERNEL_FQ void m25200_loop (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
@ -94,32 +137,96 @@ KERNEL_FQ void m25200_loop (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
if (gid >= gid_max) return;
const GLOBAL_AS u8 *pw_buf = (GLOBAL_AS u8 *) pws[gid].i;
u32 h[5];
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
const u32 pw_len = pws[gid].pw_len;
u32 idx = tmps[gid].idx;
const int pw_len64 = pw_len * 64;
u32 buf[16] = { 0 };
#define SNMPV3_TMP_ELEMS_OPT 1024 // 1024 = (64 max pw length * 64) / sizeof (u32)
// for pw length >= 64 we use global memory reads
u8 *tmp_buf = (u8 *) buf;
u32 tmp[SNMPV3_TMP_ELEMS_OPT];
sha1_ctx_t ctx = tmps[gid].ctx;
for (u32 j = 0; j < loop_cnt; j++)
if (pw_len < 64)
{
for (int i = 0; i < 64; i++)
for (int i = 0; i < pw_len64 / 4; i++)
{
tmp_buf[i] = pw_buf[idx++];
if (idx >= pw_len) idx = 0;
tmp[i] = tmps[gid].tmp[i];
}
sha1_update_swap (&ctx, buf, 64);
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmp[idx + 0];
w0[1] = tmp[idx + 1];
w0[2] = tmp[idx + 2];
w0[3] = tmp[idx + 3];
w1[0] = tmp[idx + 4];
w1[1] = tmp[idx + 5];
w1[2] = tmp[idx + 6];
w1[3] = tmp[idx + 7];
w2[0] = tmp[idx + 8];
w2[1] = tmp[idx + 9];
w2[2] = tmp[idx + 10];
w2[3] = tmp[idx + 11];
w3[0] = tmp[idx + 12];
w3[1] = tmp[idx + 13];
w3[2] = tmp[idx + 14];
w3[3] = tmp[idx + 15];
sha1_transform (w0, w1, w2, w3, h);
}
}
else
{
for (int i = 0, j = loop_pos; i < loop_cnt; i += 64, j += 64)
{
const int idx = (j % pw_len64) / 4; // the optimization trick is to be able to do this
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
w0[0] = tmps[gid].tmp[idx + 0];
w0[1] = tmps[gid].tmp[idx + 1];
w0[2] = tmps[gid].tmp[idx + 2];
w0[3] = tmps[gid].tmp[idx + 3];
w1[0] = tmps[gid].tmp[idx + 4];
w1[1] = tmps[gid].tmp[idx + 5];
w1[2] = tmps[gid].tmp[idx + 6];
w1[3] = tmps[gid].tmp[idx + 7];
w2[0] = tmps[gid].tmp[idx + 8];
w2[1] = tmps[gid].tmp[idx + 9];
w2[2] = tmps[gid].tmp[idx + 10];
w2[3] = tmps[gid].tmp[idx + 11];
w3[0] = tmps[gid].tmp[idx + 12];
w3[1] = tmps[gid].tmp[idx + 13];
w3[2] = tmps[gid].tmp[idx + 14];
w3[3] = tmps[gid].tmp[idx + 15];
sha1_transform (w0, w1, w2, w3, h);
}
}
tmps[gid].idx = idx;
tmps[gid].ctx = ctx;
tmps[gid].h[0] = h[0];
tmps[gid].h[1] = h[1];
tmps[gid].h[2] = h[2];
tmps[gid].h[3] = h[3];
tmps[gid].h[4] = h[4];
}
KERNEL_FQ void m25200_comp (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
@ -132,64 +239,108 @@ KERNEL_FQ void m25200_comp (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
if (gid >= gid_max) return;
const GLOBAL_AS u8 *engineID_buf = esalt_bufs[DIGESTS_OFFSET].engineID_buf;
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
u32 engineID_len = esalt_bufs[DIGESTS_OFFSET].engineID_len;
w0[0] = 0x80000000;
w0[1] = 0;
w0[2] = 0;
w0[3] = 0;
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 1048576 * 8;
sha1_ctx_t ctx = tmps[gid].ctx;
u32 h[5];
sha1_final (&ctx);
h[0] = tmps[gid].h[0];
h[1] = tmps[gid].h[1];
h[2] = tmps[gid].h[2];
h[3] = tmps[gid].h[3];
h[4] = tmps[gid].h[4];
const u32 h[5] = {
hc_swap32_S (ctx.h[0]),
hc_swap32_S (ctx.h[1]),
hc_swap32_S (ctx.h[2]),
hc_swap32_S (ctx.h[3]),
hc_swap32_S (ctx.h[4])
};
sha1_transform (w0, w1, w2, w3, h);
u32 tmp_buf[32] = { 0 };
tmp_buf[0] = h[0];
tmp_buf[1] = h[1];
tmp_buf[2] = h[2];
tmp_buf[3] = h[3];
tmp_buf[4] = h[4];
u8 *buf = (u8 *) (tmp_buf);
u32 i = 20;
u32 j;
for (j = 0; j < engineID_len; j++)
{
buf[i++] = engineID_buf[j];
}
for (j = 0; j < 20; j++)
{
buf[i++] = buf[j];
}
sha1_ctx_t ctx;
sha1_init (&ctx);
sha1_update_swap (&ctx, tmp_buf, i);
u32 w[16];
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_update (&ctx, w, 20);
sha1_update_global_swap (&ctx, esalt_bufs[DIGESTS_OFFSET].engineID_buf, esalt_bufs[DIGESTS_OFFSET].engineID_len);
w[ 0] = h[0];
w[ 1] = h[1];
w[ 2] = h[2];
w[ 3] = h[3];
w[ 4] = h[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_update (&ctx, w, 20);
sha1_final (&ctx);
u32 key[16] = { 0 };
key[0] = ctx.h[0];
key[1] = ctx.h[1];
key[2] = ctx.h[2];
key[3] = ctx.h[3];
key[4] = ctx.h[4];
w[ 0] = ctx.h[0];
w[ 1] = ctx.h[1];
w[ 2] = ctx.h[2];
w[ 3] = ctx.h[3];
w[ 4] = ctx.h[4];
w[ 5] = 0;
w[ 6] = 0;
w[ 7] = 0;
w[ 8] = 0;
w[ 9] = 0;
w[10] = 0;
w[11] = 0;
w[12] = 0;
w[13] = 0;
w[14] = 0;
w[15] = 0;
sha1_hmac_ctx_t hmac_ctx;
sha1_hmac_init (&hmac_ctx, key, 20);
sha1_hmac_init (&hmac_ctx, w, 20);
sha1_hmac_update_global (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha1_hmac_update_global_swap (&hmac_ctx, esalt_bufs[DIGESTS_OFFSET].salt_buf, esalt_bufs[DIGESTS_OFFSET].salt_len);
sha1_hmac_final (&hmac_ctx);
@ -204,3 +355,4 @@ KERNEL_FQ void m25200_comp (KERN_ATTR_TMPS_ESALT (hmac_sha1_tmp_t, snmpv3_t))
#include COMPARE_M
#endif
}

View File

@ -44,27 +44,35 @@ const char *module_st_pass (MAYBE_UNUSED const hashconfig_t *hashconfig,
static const char *SIGNATURE_SNMPV3 = "$SNMPv3$2$";
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_SALT_MAX_BIN 752
#define SNMPV3_ENGINEID_MAX 32
#define SNMPV3_MSG_AUTH_PARAMS_MAX 12
#define SNMPV3_SALT_MAX 1500
#define SNMPV3_ENGINEID_MAX 32
#define SNMPV3_MSG_AUTH_PARAMS_MAX 12
#define SNMPV3_ROUNDS 1048576
#define SNMPV3_MAX_PW_LENGTH 64
#define SNMPV3_TMP_ELEMS 4096 // 4096 = (256 (max pw length) * 64) / sizeof (u32)
#define SNMPV3_HASH_ELEMS 8 // 8 = aligned 5
typedef struct hmac_sha1_tmp
{
u32 idx;
sha1_ctx_t ctx;
u32 tmp[SNMPV3_TMP_ELEMS];
u32 h[SNMPV3_HASH_ELEMS];
} hmac_sha1_tmp_t;
#define SNMPV3_MAX_SALT_ELEMS 512 // 512 * 4 = 2048 > 1500, also has to be multiple of 64
#define SNMPV3_MAX_ENGINE_ELEMS 16 // 16 * 4 = 64 > 32, also has to be multiple of 64
#define SNMPV3_MAX_PNUM_ELEMS 4 // 4 * 4 = 16 > 9
typedef struct snmpv3
{
u32 salt_buf[SNMPV3_SALT_MAX_BIN];
u32 salt_buf[SNMPV3_MAX_SALT_ELEMS];
u32 salt_len;
u8 engineID_buf[SNMPV3_ENGINEID_MAX];
u32 engineID_buf[SNMPV3_MAX_ENGINE_ELEMS];
u32 engineID_len;
u8 packet_number[8+1];
u32 packet_number[SNMPV3_MAX_PNUM_ELEMS];
} snmpv3_t;
@ -82,6 +90,23 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
return tmp_size;
}
u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
  // The iteration count is fixed so that the loop count is guaranteed to be
  // a multiple of 64. Roughly 2k calls to sha1_transform per kernel
  // invocation are typically enough to overcome the PCIe bottleneck.

  const u32 transform_calls = 2048;
  const u32 loop_step       = 64;

  return transform_calls * loop_step;
}
u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
  // Upper bound on loop iterations per kernel invocation: 2048 * 64.

  const u32 transform_calls = 2048;
  const u32 loop_step       = 64;

  return transform_calls * loop_step;
}
int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
{
u32 *digest = (u32 *) digest_buf;
@ -146,12 +171,7 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
snmpv3->salt_len = hex_decode (salt_pos, salt_len, salt_ptr);
for (uint i = 0; i < snmpv3->salt_len / 2; i++)
{
snmpv3->salt_buf[i] = byte_swap_32 (snmpv3->salt_buf[i]);
}
salt->salt_iter = 16384 - 1;
salt->salt_iter = SNMPV3_ROUNDS;
// handle unique salts detection
@ -163,9 +183,9 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
// store sha1(snmpv3->salt_buf) in salt_buf
salt->salt_len = 20;
memcpy (salt->salt_buf, sha1_ctx.h, 20);
memcpy (salt->salt_buf, sha1_ctx.h, salt->salt_len);
salt->salt_len = 20;
// engineid
@ -183,13 +203,12 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
digest[0] = hex_to_u32 (hash_pos + 0);
digest[1] = hex_to_u32 (hash_pos + 8);
digest[2] = hex_to_u32 (hash_pos + 16);
digest[3] = 0;
digest[0] = byte_swap_32 (digest[0]);
digest[1] = byte_swap_32 (digest[1]);
digest[2] = byte_swap_32 (digest[2]);
digest[3] = 0;
return (PARSER_OK);
}
@ -199,33 +218,35 @@ int module_hash_encode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
snmpv3_t *snmpv3 = (snmpv3_t *) esalt_buf;
int line_len = snprintf (line_buf, 10 + strlen ((char *) snmpv3->packet_number) + 1 + 1, "%s%s$", SIGNATURE_SNMPV3, snmpv3->packet_number);
u8 *out_buf = (u8 *) line_buf;
uint i;
int out_len = snprintf (line_buf, line_size, "%s%s$", SIGNATURE_SNMPV3, (char *) snmpv3->packet_number);
u32 salt_buf_32[SNMPV3_SALT_MAX_BIN] = { 0 };
out_len += hex_encode ((u8 *) snmpv3->salt_buf, snmpv3->salt_len, out_buf + out_len);
for (i = 0; i < SNMPV3_SALT_MAX_BIN; i++) salt_buf_32[i] = byte_swap_32 (snmpv3->salt_buf[i]);
out_buf[out_len] = '$';
const u8 *salt_buf_ptr = (u8 *) salt_buf_32;
out_len++;
line_len += hex_encode (salt_buf_ptr, snmpv3->salt_len, (u8 *) line_buf+line_len);
out_len += hex_encode ((u8 *) snmpv3->engineID_buf, snmpv3->engineID_len, out_buf + out_len);
line_buf[line_len] = '$';
out_buf[out_len] = '$';
line_len++;
out_len++;
line_len += hex_encode (snmpv3->engineID_buf, snmpv3->engineID_len, (u8 *) line_buf+line_len);
u32 digest_tmp[3];
line_buf[line_len] = '$';
digest_tmp[0] = byte_swap_32 (digest[0]);
digest_tmp[1] = byte_swap_32 (digest[1]);
digest_tmp[2] = byte_swap_32 (digest[2]);
line_len++;
u32_to_hex (digest_tmp[0], out_buf + out_len); out_len += 8;
u32_to_hex (digest_tmp[1], out_buf + out_len); out_len += 8;
u32_to_hex (digest_tmp[2], out_buf + out_len); out_len += 8;
u32_to_hex (byte_swap_32 (digest[0]), (u8 *) line_buf+line_len); line_len += 8;
u32_to_hex (byte_swap_32 (digest[1]), (u8 *) line_buf+line_len); line_len += 8;
u32_to_hex (byte_swap_32 (digest[2]), (u8 *) line_buf+line_len); line_len += 8;
out_buf[out_len] = 0;
return line_len;
return out_len;
}
void module_init (module_ctx_t *module_ctx)
@ -277,8 +298,8 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = MODULE_DEFAULT;
module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = module_kernel_loops_max;
module_ctx->module_kernel_loops_min = module_kernel_loops_min;
module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kern_type = module_kern_type;