@ -3,7 +3,7 @@
* License.....: MIT
*/
# define NEW_SIMD_CODE
//# define NEW_SIMD_CODE
# ifdef KERNEL_STATIC
# include "inc_vendor.h"
@ -18,9 +18,9 @@ typedef struct gpg
{
  u32 cipher_algo;       // symmetric cipher id used for the secret-key material
  u32 iv[4];             // CFB initialization vector (16 bytes)
  u32 modulus_size;      // RSA/DSA/ElGamal modulus size -- TODO confirm unit (bytes vs bits) against host code
  u32 encrypted_data[384];
  u32 encrypted_data_size;

} gpg_t;
@ -42,31 +42,58 @@ typedef struct gpg_tmp
} gpg_tmp_t ;
DECLSPEC void memcat_be_S ( u32 *block, const u32 offset, const u32 *append, u32 len )
DECLSPEC u32 hc_bytealign_le_S ( const u32 a, const u32 b, const int c )
{
const int c_mod_4 = c & 3 ;
const u32 r = hc_byte_perm_S ( b, a, ( 0x76543210 >> ( c_mod_4 * 4 ) ) & 0xffff ) ;
return r ;
}
DECLSPEC void memcat_le_S (u32 *block, const u32 offset, const u32 *append, u32 len)
{
  // Appends 'len' bytes of 'append' into 'block' starting at byte 'offset'
  // (little-endian u32 packing). Assumes offset > 0 and that block[] is
  // zero beyond 'offset' so the first word can be OR-merged in place.
  const u32 start_index = (offset - 1) >> 2;                  // first u32 touched
  const u32 count = ((offset + len + 3) >> 2) - start_index;  // u32s written in total

  const int off_mod_4 = offset & 3;
  const int off_minus_4 = 4 - off_mod_4;

  // merge the head of append[] into the partially-filled first word
  block[start_index] |= hc_bytealign_le_S (append[0], 0, off_minus_4);

  // every following word combines the tail of the previous append word
  // with the head of the current one
  for (u32 idx = 1; idx < count; idx++)
  {
    block[start_index + idx] = hc_bytealign_le_S (append[idx], append[idx - 1], off_minus_4);
  }
}
DECLSPEC void memzero_ b e_S ( u32 *block, const u32 start_offset, const u32 end_offset )
DECLSPEC void memzero_ l e_S ( u32 *block, const u32 start_offset, const u32 end_offset )
{
const u32 start_idx = ( start_offset + 3 ) / 4 ;
const u32 start_idx = start_offset / 4 ;
// zero out bytes in the first u32 starting from 'start_offset '
block[start_idx] &= 0xffffffff >> ( ( 4 - ( start_offset & 3 ) ) * 8 ) ;
const u32 end_idx = ( end_offset + 3 ) / 4 ;
// zero out bytes in u32 units -- note that the last u32 is completely zeroed!
for ( u32 i = start_idx + 1 ; i < end_idx; i++)
{
block[i] = 0 ;
}
}
DECLSPEC void memzero_be_S (u32 *block, const u32 start_offset, const u32 end_offset)
{
  // Zeroes block[] from byte 'start_offset' up to byte 'end_offset'
  // (big-endian layout). Note that the last u32 is completely zeroed!
  const u32 start_idx = start_offset / 4;

  if (start_offset & 3)
  {
    // keep only the (start_offset & 3) high-order bytes of the first u32
    block[start_idx] &= 0xffffffff << ((4 - (start_offset & 3)) * 8);
  }
  else
  {
    // aligned start: a shift by 32 would be undefined, zero the word directly
    block[start_idx] = 0;
  }

  const u32 end_idx = (end_offset + 3) / 4;

  // zero out the remaining whole u32 units
  for (u32 i = start_idx + 1; i < end_idx; i++)
  {
    block[i] = 0;
  }
}
@ -143,13 +170,14 @@ DECLSPEC int check_decoded_data (u32 *decoded_data, const u32 decoded_data_size)
const u32 sha1_u32_off = sha1_byte_off / 4 ;
u32 expected_sha1[5] ;
expected_sha1[0] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 1], decoded_data[sha1_u32_off + 0], sha1_byte_off ) ;
expected_sha1[1] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 2], decoded_data[sha1_u32_off + 1], sha1_byte_off ) ;
expected_sha1[2] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 3], decoded_data[sha1_u32_off + 2], sha1_byte_off ) ;
expected_sha1[3] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 4], decoded_data[sha1_u32_off + 3], sha1_byte_off ) ;
expected_sha1[4] = hc_bytealign_be_S ( decoded_data[sha1_u32_off + 5], decoded_data[sha1_u32_off + 4], sha1_byte_off ) ;
memzero_be_S ( decoded_data, sha1_byte_off, 384 * sizeof ( u32 ) ) ;
expected_sha1[0] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 1], decoded_data[sha1_u32_off + 0], sha1_byte_off ) ;
expected_sha1[1] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 2], decoded_data[sha1_u32_off + 1], sha1_byte_off ) ;
expected_sha1[2] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 3], decoded_data[sha1_u32_off + 2], sha1_byte_off ) ;
expected_sha1[3] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 4], decoded_data[sha1_u32_off + 3], sha1_byte_off ) ;
expected_sha1[4] = hc_bytealign_le_S ( decoded_data[sha1_u32_off + 5], decoded_data[sha1_u32_off + 4], sha1_byte_off ) ;
memzero_le_S ( decoded_data, sha1_byte_off, 384 * sizeof ( u32 ) ) ;
sha1_ctx_t ctx ;
@ -188,14 +216,31 @@ KERNEL_FQ void m17010_init (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
// create a number of copies for efficiency
const u32 copies = 80 * sizeof ( u32 ) / salted_pw_len ;
for ( u32 idx = 1 ; idx < copies; idx++)
{
memcat_ b e_S ( salted_pw_block, idx * salted_pw_len, salted_pw_block, salted_pw_len ) ;
memcat_ l e_S ( salted_pw_block, idx * salted_pw_len, salted_pw_block, salted_pw_len ) ;
}
for ( u32 idx = 0 ; idx < 80; idx++) tmps[gid].salted_pw_block[idx] = salted_pw_block[idx];
for ( u32 idx = 0 ; idx < 80; idx++)
{
tmps[gid].salted_pw_block[idx] = hc_swap32_S ( salted_pw_block[idx] ) ;
}
tmps[gid].salted_pw_block_len = ( copies * salted_pw_len ) ;
tmps[gid].h[0] = SHA1M_A ;
tmps[gid].h[1] = SHA1M_B ;
tmps[gid].h[2] = SHA1M_C ;
tmps[gid].h[3] = SHA1M_D ;
tmps[gid].h[4] = SHA1M_E ;
tmps[gid].h[5] = SHA1M_A ;
tmps[gid].h[6] = SHA1M_B ;
tmps[gid].h[7] = SHA1M_C ;
tmps[gid].h[8] = SHA1M_D ;
tmps[gid].h[9] = SHA1M_E ;
tmps[gid].len = 0 ;
}
KERNEL_FQ void m17010_loop_prepare (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
{
  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  // Reset the SHA1 message buffer for this salt_repeat pass. The SHA1
  // state itself lives in tmps[gid].h[salt_repeat * 5] (set up by _init).
  tmps[gid].w0[0] = 0;
  tmps[gid].w0[1] = 0;
  tmps[gid].w0[2] = 0;
  tmps[gid].w0[3] = 0;
  tmps[gid].w1[0] = 0;
  tmps[gid].w1[1] = 0;
  tmps[gid].w1[2] = 0;
  tmps[gid].w1[3] = 0;
  tmps[gid].w2[0] = 0;
  tmps[gid].w2[1] = 0;
  tmps[gid].w2[2] = 0;
  tmps[gid].w2[3] = 0;
  tmps[gid].w3[0] = 0;
  tmps[gid].w3[1] = 0;
  tmps[gid].w3[2] = 0;
  tmps[gid].w3[3] = 0;

  // Starting len at salt_repeat makes the (zeroed) buffer act as the
  // salt_repeat zero-byte pad needed for larger target key sizes
  // (e.g. AES-256) -- replaces the old explicit sha1_update of zeroes.
  tmps[gid].len = salt_repeat;
}
KERNEL_FQ void m17010_loop ( KERN_ATTR_TMPS_ESALT ( gpg_tmp_t, gpg_t ) )
@ -236,12 +274,15 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
const u64 gid = get_global_id ( 0 ) ;
if ( gid >= gid_max ) return ;
// get the prepared buffer from the gpg_tmp_t struct into a local buffer
u32 salted_pw_block[80] ;
for ( int i = 0 ; i < 80; i++) salted_pw_block[i] = tmps[gid].salted_pw_block[i];
const u32 salted_pw_block_len = tmps[gid].salted_pw_block_len ;
// do we really need this, since the salt is always length 8?
if ( salted_pw_block_len == 0 ) return ;
/**
@ -253,6 +294,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
const u32 sha_offset = salt_repeat * 5 ;
for ( int i = 0 ; i < 5; i++) ctx.h[i] = tmps[gid].h[sha_offset + i];
for ( int i = 0 ; i < 4; i++) ctx.w0[i] = tmps[gid].w0[i];
for ( int i = 0 ; i < 4; i++) ctx.w1[i] = tmps[gid].w1[i];
for ( int i = 0 ; i < 4; i++) ctx.w2[i] = tmps[gid].w2[i];
@ -268,7 +310,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
for ( u32 i = 0 ; i < rounds; i++)
{
sha1_update _swap ( &ctx, salted_pw_block, salted_pw_block_len ) ;
sha1_update ( &ctx, salted_pw_block, salted_pw_block_len ) ;
}
if ( ( loop_pos + loop_cnt ) == salt_iter )
@ -279,7 +321,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
{
memzero_be_S ( salted_pw_block, remaining_bytes, salted_pw_block_len ) ;
sha1_update _swap ( &ctx, salted_pw_block, remaining_bytes ) ;
sha1_update ( &ctx, salted_pw_block, remaining_bytes ) ;
}
sha1_final ( &ctx ) ;
@ -290,6 +332,7 @@ KERNEL_FQ void m17010_loop (KERN_ATTR_TMPS_ESALT (gpg_tmp_t, gpg_t))
*/
for ( int i = 0 ; i < 5; i++) tmps[gid].h[sha_offset + i] = ctx.h[i];
for ( int i = 0 ; i < 4; i++) tmps[gid].w0[i] = ctx.w0[i];
for ( int i = 0 ; i < 4; i++) tmps[gid].w1[i] = ctx.w1[i];
for ( int i = 0 ; i < 4; i++) tmps[gid].w2[i] = ctx.w2[i];