Store precomputed KE for -m 22100 in shared memory and lock the loops per kernel invocation to a fixed value

pull/2259/head
Jens Steube 5 years ago
parent db5decb750
commit 311d363054

OpenCL/m22100-pure.cl

@@ -25,7 +25,7 @@ typedef struct bitlocker
   u32 type;
   u32 iv[4];
   u32 data[15];
-  u32 wb_ke_pc[ITERATION_BITLOCKER][64]; // only 48 needed
+  u32 wb_ke_pc[ITERATION_BITLOCKER][48];
 } bitlocker_t;
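
Dropping the first 16 schedule words (they are rebuilt in registers, see the kernel changes below) is what makes the shared-memory copy fit: with the loop count pinned to 256, one window of precomputed schedule data is exactly 48 KiB. A quick size check; treating 48 KiB as the per-workgroup local-memory budget is an assumption about the target GPU, not something the patch states:

#include <stdio.h>

#define LOOP_FIXED 256   // loops per kernel invocation, pinned by the module
#define KE_WORDS    48   // schedule words 16..63; the first 16 stay in registers

int main (void)
{
  const unsigned new_bytes = LOOP_FIXED * KE_WORDS * 4;  // 256 * 48 * 4 = 49152 = 48 KiB
  const unsigned old_bytes = LOOP_FIXED * 64 * 4;        // 65536 = 64 KiB, would not fit
  printf ("%u vs %u bytes of LOCAL_AS per workgroup\n", new_bytes, old_bytes);
  return 0;
}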
@@ -36,7 +36,13 @@ typedef struct bitlocker_tmp
 } bitlocker_tmp_t;
 
-DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, u32x *digest, const GLOBAL_AS u32 wb_ke_pc[64])
+#ifdef REAL_SHM
+#define SHM_TYPE2 LOCAL_AS
+#else
+#define SHM_TYPE2 GLOBAL_AS
+#endif
+
+DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const u32x *w2, const u32x *w3, u32x *digest, SHM_TYPE2 u32 s_wb_ke_pc[48])
 {
   u32x a = digest[0];
   u32x b = digest[1];
@@ -64,24 +70,24 @@ DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const
   u32x we_t = w3[2];
   u32x wf_t = w3[3];
 
-#define ROUND_EXPAND_PC(i) \
-{ \
-  w0_t = wb_ke_pc[i + 0]; \
-  w1_t = wb_ke_pc[i + 1]; \
-  w2_t = wb_ke_pc[i + 2]; \
-  w3_t = wb_ke_pc[i + 3]; \
-  w4_t = wb_ke_pc[i + 4]; \
-  w5_t = wb_ke_pc[i + 5]; \
-  w6_t = wb_ke_pc[i + 6]; \
-  w7_t = wb_ke_pc[i + 7]; \
-  w8_t = wb_ke_pc[i + 8]; \
-  w9_t = wb_ke_pc[i + 9]; \
-  wa_t = wb_ke_pc[i + 10]; \
-  wb_t = wb_ke_pc[i + 11]; \
-  wc_t = wb_ke_pc[i + 12]; \
-  wd_t = wb_ke_pc[i + 13]; \
-  we_t = wb_ke_pc[i + 14]; \
-  wf_t = wb_ke_pc[i + 15]; \
+#define ROUND_EXPAND_PC(i) \
+{ \
+  w0_t = s_wb_ke_pc[i + 0]; \
+  w1_t = s_wb_ke_pc[i + 1]; \
+  w2_t = s_wb_ke_pc[i + 2]; \
+  w3_t = s_wb_ke_pc[i + 3]; \
+  w4_t = s_wb_ke_pc[i + 4]; \
+  w5_t = s_wb_ke_pc[i + 5]; \
+  w6_t = s_wb_ke_pc[i + 6]; \
+  w7_t = s_wb_ke_pc[i + 7]; \
+  w8_t = s_wb_ke_pc[i + 8]; \
+  w9_t = s_wb_ke_pc[i + 9]; \
+  wa_t = s_wb_ke_pc[i + 10]; \
+  wb_t = s_wb_ke_pc[i + 11]; \
+  wc_t = s_wb_ke_pc[i + 12]; \
+  wd_t = s_wb_ke_pc[i + 13]; \
+  we_t = s_wb_ke_pc[i + 14]; \
+  wf_t = s_wb_ke_pc[i + 15]; \
 }
 
 #define ROUND_STEP(i) \
@@ -104,12 +110,14 @@ DECLSPEC void sha256_transform_vector_pc (const u32x *w0, const u32x *w1, const
     SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \
   }
 
+  ROUND_STEP (0);
+
 #ifdef _unroll
 #pragma unroll
 #endif
-  for (int i = 0; i < 64; i += 16)
+  for (int i = 16; i < 64; i += 16)
   {
-    ROUND_EXPAND_PC (i); ROUND_STEP (i);
+    ROUND_EXPAND_PC (i - 16); ROUND_STEP (i);
   }
 
 #undef ROUND_EXPAND_PC
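
With ROUND_STEP (0) consuming the sixteen message words still held in registers, the 48-entry array only stores schedule words 16..63, so round block i now reads entry i - 16 where the old code read entry i. A scalar sanity-check sketch of that index shift (plain C, not kernel code; full[] stands for the old 64-word layout, pc[] for the new 48-word one):

#include <assert.h>
#include <stdint.h>

/* pc[t] is defined as full[t + 16]; verify the indexing used by the new
   ROUND_EXPAND_PC (i - 16) against the old ROUND_EXPAND_PC (i) */
static void check_index_shift (const uint32_t full[64], const uint32_t pc[48])
{
  for (int i = 16; i < 64; i += 16)   // the three remaining round blocks
    for (int k = 0; k < 16; k++)
      assert (full[i + k] == pc[(i - 16) + k]);
}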
@@ -188,9 +196,60 @@ KERNEL_FQ void m22100_init (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
 KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
 {
   const u64 gid = get_global_id (0);
+  const u64 lid = get_local_id (0);
+  const u64 lsz = get_local_size (0);
+
+  /**
+   * load 256 full w[] precomputed KE buffers into shared memory since it's all static data
+   * in order for this to work we need to fix the loop count per kernel invocation to 256
+   */
+
+#ifdef REAL_SHM
+
+  LOCAL_VK u32 s_wb_ke_pc[256][48];
+
+  for (u32 i = lid; i < 256; i += lsz)
+  {
+    for (u32 j = 0; j < 48; j++) // the first 16 words are kept in registers
+    {
+      s_wb_ke_pc[i][j] = esalt_bufs[digests_offset].wb_ke_pc[loop_pos + i][j];
+    }
+  }
+
+  SYNC_THREADS ();
+
+#else
+
+  GLOBAL_AS u32 (*s_wb_ke_pc)[48] = &esalt_bufs[digests_offset].wb_ke_pc[loop_pos];
+
+#endif
 
   if ((gid * VECT_SIZE) >= gid_max) return;
 
+  // salt to registers
+
+  u32x t0[4];
+  u32x t1[4];
+  u32x t2[4];
+  u32x t3[4];
+
+  t0[0] = salt_bufs[salt_pos].salt_buf[0];
+  t0[1] = salt_bufs[salt_pos].salt_buf[1];
+  t0[2] = salt_bufs[salt_pos].salt_buf[2];
+  t0[3] = salt_bufs[salt_pos].salt_buf[3];
+  t1[0] = 0;
+  t1[1] = 0;
+  t1[2] = 0x80000000;
+  t1[3] = 0;
+  t2[0] = 0;
+  t2[1] = 0;
+  t2[2] = 0;
+  t2[3] = 0;
+  t3[0] = 0;
+  t3[1] = 0;
+  t3[2] = 0;
+  t3[3] = 88 * 8;
+
   // init
 
   u32x w0[4];
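
The REAL_SHM branch above is a standard cooperative load: the workgroup's threads stride across the 256 iteration slots so each slot is copied exactly once, and SYNC_THREADS () publishes the table before any reads. Note that the early-exit gid check now sits after the barrier, since every thread in the workgroup must reach it. A stripped-down scalar model of the access pattern (generic C with stand-in names, not the kernel's types):

#include <stdint.h>

/* model of the strided copy: thread `lid` of `lsz` handles slots
   lid, lid + lsz, lid + 2*lsz, ... so the table is written exactly once */
static void coop_load (uint32_t dst[256][48], const uint32_t src[][48],
                       unsigned lid, unsigned lsz)
{
  for (unsigned i = lid; i < 256; i += lsz)
    for (unsigned j = 0; j < 48; j++)
      dst[i][j] = src[i][j];
  /* in the kernel a workgroup barrier (SYNC_THREADS ()) follows here */
}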
@@ -230,8 +289,11 @@ KERNEL_FQ void m22100_loop (KERN_ATTR_TMPS_ESALT (bitlocker_tmp_t, bitlocker_t))
     digest[6] = SHA256M_G;
     digest[7] = SHA256M_H;
 
-    sha256_transform_vector (w0, w1, w2, w3, digest);
-    sha256_transform_vector_pc (w0, w1, w2, w3, digest, esalt_bufs[digests_offset].wb_ke_pc[j]);
+    sha256_transform_vector (w0, w1, w2, w3, digest);
+
+    t1[0] = hc_swap32_S (j); // only moving part
+
+    sha256_transform_vector_pc (t0, t1, t2, t3, digest, s_wb_ke_pc[i]);
 
     w0[0] = digest[0];
     w0[1] = digest[1];
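
For orientation, each loop iteration is one step of the BitLocker key-stretching chain: the 88-byte message last_sha256 (32) || initial_sha256 (32) || salt (16) || count (8) is hashed as two SHA-256 blocks. Block one (the two digests) changes completely every iteration and keeps the generic transform; in block two only the counter word moves, which is why the rest of its schedule can be precomputed. A hedged layout sketch in plain C (the compress calls themselves are left out; little-endian host assumed for the count copy):

#include <stdint.h>
#include <string.h>

/* build the two 64-byte SHA-256 blocks hashed per iteration */
static void build_blocks (uint8_t b1[64], uint8_t b2[64],
                          const uint8_t last[32], const uint8_t initial[32],
                          const uint8_t salt[16], uint64_t count)
{
  memcpy (b1, last, 32);          // block 1 changes fully each round
  memcpy (b1 + 32, initial, 32);  // -> generic sha256_transform_vector

  memset (b2, 0, 64);
  memcpy (b2, salt, 16);          // block 2: static salt (t0)...
  memcpy (b2 + 16, &count, 8);    // ...plus the only moving part (t1[0], t1[1])
  b2[24] = 0x80;                  // padding bit after 88 message bytes (t1[2])
  b2[62] = 0x02; b2[63] = 0xc0;   // length field: 88 * 8 = 704 = 0x2c0 bits (t3[3])
  // -> sha256_transform_vector_pc with the precomputed schedule words 16..63
}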

src/modules/module_22100.c

@@ -51,7 +51,7 @@ typedef struct bitlocker
   u32 type;
   u32 iv[4];
   u32 data[15];
-  u32 wb_ke_pc[ITERATION_BITLOCKER][64]; // only 48 needed
+  u32 wb_ke_pc[ITERATION_BITLOCKER][48];
 } bitlocker_t;
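
Since the esalt blob is copied to the device as raw bytes, this module-side bitlocker_t has to stay byte-identical to the kernel-side definition changed above. A hypothetical guard, not part of the patch, that would catch the two copies drifting apart:

/* hypothetical compile-time check: both definitions must agree on the layout */
_Static_assert (sizeof (((bitlocker_t *) 0)->wb_ke_pc)
                == (size_t) ITERATION_BITLOCKER * 48 * sizeof (u32),
                "wb_ke_pc layout drifted from the kernel definition");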
@@ -78,11 +78,18 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
   return tmp_size;
 }
 
-u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+u32 module_kernel_loops_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
 {
-  const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 256;
+  const u32 kernel_loops_min = 256;
 
-  return kernel_threads_max;
+  return kernel_loops_min;
 }
 
+u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
+{
+  const u32 kernel_loops_max = 256;
+
+  return kernel_loops_max;
+}
+
 u32 module_pw_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
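
Pinning module_kernel_loops_min and module_kernel_loops_max to the same value removes the autotuner's freedom: every invocation of the loop kernel gets exactly 256 iterations, so loop_pos advances in steps of 256 and the kernel can safely index s_wb_ke_pc with its local counter i. Roughly, the host-side driver loop degenerates to the following (illustrative only; run_loop_kernel is a made-up stand-in, and BitLocker's iteration count of 1048576 is assumed):

#define ITERATION_BITLOCKER 1048576
#define LOOP_FIXED          256

static void dispatch_loops (void (*run_loop_kernel) (unsigned loop_pos, unsigned loop_cnt))
{
  for (unsigned loop_pos = 0; loop_pos < ITERATION_BITLOCKER; loop_pos += LOOP_FIXED)
  {
    run_loop_kernel (loop_pos, LOOP_FIXED); // kernel sees s_wb_ke_pc[i] == wb_ke_pc[loop_pos + i]
  }
  // 1048576 / 256 = 4096 loop-kernel calls per password batch
}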
@@ -210,34 +217,35 @@ int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSE
   for (int i = 0; i < ITERATION_BITLOCKER; i++)
   {
-    bitlocker->wb_ke_pc[i][ 0] = salt->salt_buf[0];
-    bitlocker->wb_ke_pc[i][ 1] = salt->salt_buf[1];
-    bitlocker->wb_ke_pc[i][ 2] = salt->salt_buf[2];
-    bitlocker->wb_ke_pc[i][ 3] = salt->salt_buf[3];
-    bitlocker->wb_ke_pc[i][ 4] = byte_swap_32 (i);
-    bitlocker->wb_ke_pc[i][ 5] = 0;
-    bitlocker->wb_ke_pc[i][ 6] = 0x80000000;
-    bitlocker->wb_ke_pc[i][ 7] = 0;
-    bitlocker->wb_ke_pc[i][ 8] = 0;
-    bitlocker->wb_ke_pc[i][ 9] = 0;
-    bitlocker->wb_ke_pc[i][10] = 0;
-    bitlocker->wb_ke_pc[i][11] = 0;
-    bitlocker->wb_ke_pc[i][12] = 0;
-    bitlocker->wb_ke_pc[i][13] = 0;
-    bitlocker->wb_ke_pc[i][14] = 0;
-    bitlocker->wb_ke_pc[i][15] = 88 * 8;
+    u32 tmp[64];
+
+    tmp[ 0] = salt->salt_buf[0];
+    tmp[ 1] = salt->salt_buf[1];
+    tmp[ 2] = salt->salt_buf[2];
+    tmp[ 3] = salt->salt_buf[3];
+    tmp[ 4] = byte_swap_32 (i);
+    tmp[ 5] = 0;
+    tmp[ 6] = 0x80000000;
+    tmp[ 7] = 0;
+    tmp[ 8] = 0;
+    tmp[ 9] = 0;
+    tmp[10] = 0;
+    tmp[11] = 0;
+    tmp[12] = 0;
+    tmp[13] = 0;
+    tmp[14] = 0;
+    tmp[15] = 88 * 8;
 
 #define hc_rotl32_S rotl32
 
     for (int j = 16; j < 64; j++)
     {
-      bitlocker->wb_ke_pc[i][j] = SHA256_EXPAND_S
-      (
-        bitlocker->wb_ke_pc[i][j - 2],
-        bitlocker->wb_ke_pc[i][j - 7],
-        bitlocker->wb_ke_pc[i][j - 15],
-        bitlocker->wb_ke_pc[i][j - 16]
-      );
+      tmp[j] = SHA256_EXPAND_S (tmp[j - 2], tmp[j - 7], tmp[j - 15], tmp[j - 16]);
     }
+
+    for (int j = 0; j < 48; j++)
+    {
+      bitlocker->wb_ke_pc[i][j] = tmp[16 + j];
+    }
   }
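
SHA256_EXPAND_S above is hashcat's scalar message-schedule step, w[t] = sigma1 (w[t-2]) + w[t-7] + sigma0 (w[t-15]) + w[t-16]. A self-contained sketch of the same precompute with plain stand-ins for the hashcat macros (sig0/sig1 are the standard SHA-256 small sigmas; names here are local helpers, not hashcat's):

#include <stdint.h>

static uint32_t rotr32 (uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static uint32_t sig0 (uint32_t x) { return rotr32 (x, 7) ^ rotr32 (x, 18) ^ (x >> 3); }
static uint32_t sig1 (uint32_t x) { return rotr32 (x, 17) ^ rotr32 (x, 19) ^ (x >> 10); }

/* expand one 16-word block to the full 64-word schedule, keep words 16..63 */
static void precompute_ke (const uint32_t block[16], uint32_t out48[48])
{
  uint32_t w[64];

  for (int t = 0;  t < 16; t++) w[t] = block[t];
  for (int t = 16; t < 64; t++) w[t] = sig1 (w[t - 2]) + w[t - 7] + sig0 (w[t - 15]) + w[t - 16];
  for (int t = 0;  t < 48; t++) out48[t] = w[16 + t];
}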
@@ -423,9 +431,9 @@ void module_init (module_ctx_t *module_ctx)
   module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
   module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
-  module_ctx->module_kernel_loops_max = MODULE_DEFAULT;
-  module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
-  module_ctx->module_kernel_threads_max = module_kernel_threads_max;
+  module_ctx->module_kernel_loops_max = module_kernel_loops_max;
+  module_ctx->module_kernel_loops_min = module_kernel_loops_min;
+  module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
   module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
   module_ctx->module_kern_type = module_kern_type;
   module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
