Review notes (text before the first "diff --git" line is not part of any hunk):

  What the patch does
    * OpenCL/m17010-pure.cl: comments out NEW_SIMD_CODE, adds hc_bytealign_le_S,
      replaces memcat_be_S with memcat_le_S, adds memzero_le_S and rewrites
      memzero_be_S, switches check_decoded_data to the _le_ helpers, stores the
      salted password block through hc_swap32_S in m17010_init and seeds
      h[0..9] with SHA1M_A..SHA1M_E, zeroes w0..w3 and sets len = salt_repeat in
      m17010_loop_prepare, and calls sha1_update instead of sha1_update_swap in
      m17010_loop.
    * src/modules/module_17010.c: drops OPTI_TYPE_SLOW_HASH_SIMD_LOOP.

  Fix applied during review
    memzero_le_S / memzero_be_S computed the mask for the first word with a
    shift of (4 - (start_offset & 3)) * 8, which is 32 when start_offset is a
    multiple of 4. A 32-bit shift of a 32-bit value is undefined in C, and
    OpenCL C reduces the shift count modulo the width, so the mask stayed
    0xffffffff and block[start_idx] was left untouched while the clearing loop
    starts at start_idx + 1. The masks are now ~(0xffffffff << k) and
    ~(0xffffffff >> k) with k = (start_offset & 3) * 8, so k is 0, 8, 16 or 24.
    The new-side line counts/offsets of the hunk headers below account for the
    four comment lines added with the fix; the post-image hash on the first
    "index" line predates it.

  Caveat
    This copy of the patch had its whitespace normalised. Indentation and
    column alignment are reconstructed, and the whitespace-only -/+ pairs
    (struct gpg fields, blank line in m17010_loop) look identical here.
    Regenerate from the working tree, or apply with whitespace ignored.

diff --git a/OpenCL/m17010-pure.cl b/OpenCL/m17010-pure.cl
index 4b1f2bcab..84c034777 100644
--- a/OpenCL/m17010-pure.cl
+++ b/OpenCL/m17010-pure.cl
@@ -3,7 +3,7 @@
  * License.....: MIT
  */
 
-#define NEW_SIMD_CODE
+//#define NEW_SIMD_CODE
 
 #ifdef KERNEL_STATIC
 #include "inc_vendor.h"
@@ -18,9 +18,9 @@ typedef struct gpg
 {
   u32 cipher_algo;
   u32 iv[4];
-  u32 modulus_size;
+  u32 modulus_size;
   u32 encrypted_data[384];
-  u32 encrypted_data_size;
+  u32 encrypted_data_size;
 
 } gpg_t;
 
@@ -42,31 +42,62 @@ typedef struct gpg_tmp
 
 } gpg_tmp_t;
 
-DECLSPEC void memcat_be_S (u32 *block, const u32 offset, const u32 *append, u32 len)
+
+DECLSPEC u32 hc_bytealign_le_S (const u32 a, const u32 b, const int c)
+{
+  const int c_mod_4 = c & 3;
+
+  const u32 r = hc_byte_perm_S (b, a, (0x76543210 >> (c_mod_4 * 4)) & 0xffff);
+
+  return r;
+}
+
+DECLSPEC void memcat_le_S (u32 *block, const u32 offset, const u32 *append, u32 len)
 {
   const u32 start_index = (offset - 1) >> 2;
   const u32 count = ((offset + len + 3) >> 2) - start_index;
   const int off_mod_4 = offset & 3;
   const int off_minus_4 = 4 - off_mod_4;
 
-  block[start_index] |= hc_bytealign_be_S (append[0], 0, off_minus_4);
+  block[start_index] |= hc_bytealign_le_S (append[0], 0, off_minus_4);
 
   for (u32 idx = 1; idx < count; idx++)
   {
-    block[start_index + idx] = hc_bytealign_be_S (append[idx], append[idx - 1], off_minus_4);
+    block[start_index + idx] = hc_bytealign_le_S (append[idx], append[idx - 1], off_minus_4);
   }
 }
 
-DECLSPEC void memzero_be_S (u32 *block, const u32 start_offset, const u32 end_offset)
+DECLSPEC void memzero_le_S (u32 *block, const u32 start_offset, const u32 end_offset)
 {
-  const u32 start_idx = (start_offset + 3) / 4;
+  const u32 start_idx = start_offset / 4;
+
+  // zero out bytes in the first u32 starting from 'start_offset'
+  // keep only the low (start_offset & 3) bytes; the shift count is 0, 8, 16 or 24 (never 32),
+  // so a 4-byte-aligned start_offset clears the whole word instead of leaving it untouched
+  block[start_idx] &= ~(0xffffffff << ((start_offset & 3) * 8));
+
   const u32 end_idx = (end_offset + 3) / 4;
 
+  // zero out bytes in u32 units -- note that the last u32 is completely zeroed!
+  for (u32 i = start_idx + 1; i < end_idx; i++)
+  {
+    block[i] = 0;
+  }
+}
+
+DECLSPEC void memzero_be_S (u32 *block, const u32 start_offset, const u32 end_offset)
+{
+  const u32 start_idx = start_offset / 4;
+
   // zero out bytes in the first u32 starting from 'start_offset'
-  block[start_idx - 1] &= 0xffffffff >> (((4 - start_offset) & 3) * 8);
+  // keep only the high (start_offset & 3) bytes; the shift count is 0, 8, 16 or 24 (never 32),
+  // so a 4-byte-aligned start_offset clears the whole word instead of leaving it untouched
+  block[start_idx] &= ~(0xffffffff >> ((start_offset & 3) * 8));
+
+  const u32 end_idx = (end_offset + 3) / 4;
 
   // zero out bytes in u32 units -- note that the last u32 is completely zeroed!
-  for (u32 i = start_idx; i < end_idx; i++)
+  for (u32 i = start_idx + 1; i < end_idx; i++)
   {
     block[i] = 0;
   }
@@ -143,13 +174,14 @@ DECLSPEC int check_decoded_data (u32 *decoded_data, const u32 decoded_data_size)
   const u32 sha1_u32_off = sha1_byte_off / 4;
 
   u32 expected_sha1[5];
-  expected_sha1[0] = hc_bytealign_be_S (decoded_data[sha1_u32_off + 1], decoded_data[sha1_u32_off + 0], sha1_byte_off);
-  expected_sha1[1] = hc_bytealign_be_S (decoded_data[sha1_u32_off + 2], decoded_data[sha1_u32_off + 1], sha1_byte_off);
-  expected_sha1[2] = hc_bytealign_be_S (decoded_data[sha1_u32_off + 3], decoded_data[sha1_u32_off + 2], sha1_byte_off);
-  expected_sha1[3] = hc_bytealign_be_S (decoded_data[sha1_u32_off + 4], decoded_data[sha1_u32_off + 3], sha1_byte_off);
-  expected_sha1[4] = hc_bytealign_be_S (decoded_data[sha1_u32_off + 5], decoded_data[sha1_u32_off + 4], sha1_byte_off);
 
-  memzero_be_S (decoded_data, sha1_byte_off, 384 * sizeof(u32));
+  expected_sha1[0] = hc_bytealign_le_S (decoded_data[sha1_u32_off + 1], decoded_data[sha1_u32_off + 0], sha1_byte_off);
+  expected_sha1[1] = hc_bytealign_le_S (decoded_data[sha1_u32_off + 2], decoded_data[sha1_u32_off + 1], sha1_byte_off);
+  expected_sha1[2] = hc_bytealign_le_S (decoded_data[sha1_u32_off + 3], decoded_data[sha1_u32_off + 2], sha1_byte_off);
+  expected_sha1[3] = hc_bytealign_le_S (decoded_data[sha1_u32_off + 4], decoded_data[sha1_u32_off + 3], sha1_byte_off);
+  expected_sha1[4] = hc_bytealign_le_S (decoded_data[sha1_u32_off + 5], decoded_data[sha1_u32_off + 4], sha1_byte_off);
+
+  memzero_le_S (decoded_data, sha1_byte_off, 384 * sizeof(u32));
 
   sha1_ctx_t ctx;
 
@@ -188,14 +220,31 @@ KERNEL_FQ void m17010_init (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
 
   // create a number of copies for efficiency
   const u32 copies = 80 * sizeof(u32) / salted_pw_len;
+
   for (u32 idx = 1; idx < copies; idx++)
   {
-    memcat_be_S (salted_pw_block, idx * salted_pw_len, salted_pw_block, salted_pw_len);
+    memcat_le_S (salted_pw_block, idx * salted_pw_len, salted_pw_block, salted_pw_len);
   }
 
-  for (u32 idx = 0; idx < 80; idx++) tmps[gid].salted_pw_block[idx] = salted_pw_block[idx];
+  for (u32 idx = 0; idx < 80; idx++)
+  {
+    tmps[gid].salted_pw_block[idx] = hc_swap32_S (salted_pw_block[idx]);
+  }
 
   tmps[gid].salted_pw_block_len = (copies * salted_pw_len);
+
+  tmps[gid].h[0] = SHA1M_A;
+  tmps[gid].h[1] = SHA1M_B;
+  tmps[gid].h[2] = SHA1M_C;
+  tmps[gid].h[3] = SHA1M_D;
+  tmps[gid].h[4] = SHA1M_E;
+  tmps[gid].h[5] = SHA1M_A;
+  tmps[gid].h[6] = SHA1M_B;
+  tmps[gid].h[7] = SHA1M_C;
+  tmps[gid].h[8] = SHA1M_D;
+  tmps[gid].h[9] = SHA1M_E;
+
+  tmps[gid].len = 0;
 }
 
 KERNEL_FQ void m17010_loop_prepare (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
@@ -204,31 +253,24 @@ KERNEL_FQ void m17010_loop_prepare (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
 
   if (gid >= gid_max) return;
 
-  /**
-   * context save
-   */
-
-  sha1_ctx_t ctx;
-
-  sha1_init (&ctx);
-
-  // padd with one or more zeroes for larger target key sizes, e.g. for AES-256
-  if (salt_repeat > 0)
-  {
-    u32 zeroes[16] = {0};
-
-    sha1_update (&ctx, zeroes, salt_repeat);
-  }
-
-  const u32 sha_offset = salt_repeat * 5;
-
-  for (int i = 0; i < 5; i++) tmps[gid].h[sha_offset + i] = ctx.h[i];
-  for (int i = 0; i < 4; i++) tmps[gid].w0[i] = ctx.w0[i];
-  for (int i = 0; i < 4; i++) tmps[gid].w1[i] = ctx.w1[i];
-  for (int i = 0; i < 4; i++) tmps[gid].w2[i] = ctx.w2[i];
-  for (int i = 0; i < 4; i++) tmps[gid].w3[i] = ctx.w3[i];
-
-  tmps[gid].len = ctx.len;
+  tmps[gid].w0[0] = 0;
+  tmps[gid].w0[1] = 0;
+  tmps[gid].w0[2] = 0;
+  tmps[gid].w0[3] = 0;
+  tmps[gid].w1[0] = 0;
+  tmps[gid].w1[1] = 0;
+  tmps[gid].w1[2] = 0;
+  tmps[gid].w1[3] = 0;
+  tmps[gid].w2[0] = 0;
+  tmps[gid].w2[1] = 0;
+  tmps[gid].w2[2] = 0;
+  tmps[gid].w2[3] = 0;
+  tmps[gid].w3[0] = 0;
+  tmps[gid].w3[1] = 0;
+  tmps[gid].w3[2] = 0;
+  tmps[gid].w3[3] = 0;
+
+  tmps[gid].len = salt_repeat;
 }
 
 KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
@@ -236,12 +278,15 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
   const u64 gid = get_global_id (0);
 
   if (gid >= gid_max) return;
-
+
   // get the prepared buffer from the gpg_tmp_t struct into a local buffer
   u32 salted_pw_block[80];
+
   for (int i = 0; i < 80; i++) salted_pw_block[i] = tmps[gid].salted_pw_block[i];
 
   const u32 salted_pw_block_len = tmps[gid].salted_pw_block_len;
+
+  // do we really need this, since the salt is always length 8?
   if (salted_pw_block_len == 0) return;
 
   /**
@@ -253,6 +298,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
   const u32 sha_offset = salt_repeat * 5;
 
   for (int i = 0; i < 5; i++) ctx.h[i] = tmps[gid].h[sha_offset + i];
+
   for (int i = 0; i < 4; i++) ctx.w0[i] = tmps[gid].w0[i];
   for (int i = 0; i < 4; i++) ctx.w1[i] = tmps[gid].w1[i];
   for (int i = 0; i < 4; i++) ctx.w2[i] = tmps[gid].w2[i];
@@ -268,7 +314,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
 
   for (u32 i = 0; i < rounds; i++)
   {
-    sha1_update_swap (&ctx, salted_pw_block, salted_pw_block_len);
+    sha1_update (&ctx, salted_pw_block, salted_pw_block_len);
   }
 
   if ((loop_pos + loop_cnt) == salt_iter)
@@ -279,7 +325,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
     {
       memzero_be_S (salted_pw_block, remaining_bytes, salted_pw_block_len);
 
-      sha1_update_swap (&ctx, salted_pw_block, remaining_bytes);
+      sha1_update (&ctx, salted_pw_block, remaining_bytes);
     }
 
     sha1_final (&ctx);
@@ -290,6 +336,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
    */
 
   for (int i = 0; i < 5; i++) tmps[gid].h[sha_offset + i] = ctx.h[i];
+
   for (int i = 0; i < 4; i++) tmps[gid].w0[i] = ctx.w0[i];
   for (int i = 0; i < 4; i++) tmps[gid].w1[i] = ctx.w1[i];
   for (int i = 0; i < 4; i++) tmps[gid].w2[i] = ctx.w2[i];
diff --git a/src/modules/module_17010.c b/src/modules/module_17010.c
index fca881c75..31012f091 100644
--- a/src/modules/module_17010.c
+++ b/src/modules/module_17010.c
@@ -19,8 +19,7 @@ static const u32 DGST_SIZE = DGST_SIZE_4_4;
 static const u32 HASH_CATEGORY = HASH_CATEGORY_RAW_HASH;
 static const char *HASH_NAME = "GPG (AES-128/AES-256 (SHA-1($pass)))";
 static const u64 KERN_TYPE = 17010;
-static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE
-                           | OPTI_TYPE_SLOW_HASH_SIMD_LOOP;
+static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE;
 static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE
                            | OPTS_TYPE_LOOP_PREPARE
                            | OPTS_TYPE_AUX1