From 71a8f97294ae45ec01c6b30eea4c3c2b6d84ba1d Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 5 Apr 2021 17:59:42 +0200 Subject: [PATCH] Optimize GCM code to use only u32 data types, make it CUDA compatible and remove some branches --- OpenCL/inc_cipher_aes-gcm.cl | 154 +++++++----------- OpenCL/inc_cipher_aes-gcm.h | 6 +- ...27000-optimized.cl => m25500-optimized.cl} | 58 ++----- OpenCL/{m27000-pure.cl => m25500-pure.cl} | 21 +-- docs/changes.txt | 2 +- docs/readme.txt | 2 +- .../{module_27000.c => module_25500.c} | 11 +- 7 files changed, 88 insertions(+), 166 deletions(-) rename OpenCL/{m27000-optimized.cl => m25500-optimized.cl} (89%) rename OpenCL/{m27000-pure.cl => m25500-pure.cl} (97%) rename src/modules/{module_27000.c => module_25500.c} (97%) diff --git a/OpenCL/inc_cipher_aes-gcm.cl b/OpenCL/inc_cipher_aes-gcm.cl index efc05bd09..97d7b0f28 100644 --- a/OpenCL/inc_cipher_aes-gcm.cl +++ b/OpenCL/inc_cipher_aes-gcm.cl @@ -10,104 +10,55 @@ #include "inc_cipher_aes.h" #include "inc_cipher_aes-gcm.h" -#ifndef AES_GCM_ALT1 -DECLSPEC void AES_GCM_shift_right_block(uchar *block) -{ - u32 val; - - uchar16 *v = (uchar16 *) block; - uint4 *p = (uint4 *) block; - - val = hc_swap32_S (p[0].w); - val >>= 1; - if (v[0].sb & 0x01) val |= 0x80000000; - p[0].w = hc_swap32_S (val); - - val = hc_swap32_S (p[0].z); - val >>= 1; - if (v[0].s7 & 0x01) val |= 0x80000000; - p[0].z = hc_swap32_S (val); - - val = hc_swap32_S (p[0].y); - val >>= 1; - if (v[0].s3 & 0x01) val |= 0x80000000; - p[0].y = hc_swap32_S (val); - - val = hc_swap32_S (p[0].x); - val >>= 1; - p[0].x = hc_swap32_S (val); -} -#endif // AES_GCM_ALT1 - DECLSPEC void AES_GCM_inc32 (u32 *block) { - block[3] += 0x00000001; + block[3] += 1; } DECLSPEC void AES_GCM_xor_block (u32 *dst, const u32 *src) { - *dst++ ^= *src++; - *dst++ ^= *src++; - *dst++ ^= *src++; - *dst++ ^= *src++; + dst[0] ^= src[0]; + dst[1] ^= src[1]; + dst[2] ^= src[2]; + dst[3] ^= src[3]; } -DECLSPEC void AES_GCM_gf_mult (const uchar16 *x, const uchar16 *y, uchar16 *z) +DECLSPEC void AES_GCM_gf_mult (const u32 *x, const u32 *y, u32 *z) { - u32 i, j; - z[0] = 0; + z[1] = 0; + z[2] = 0; + z[3] = 0; - uchar16 v = y[0].s32107654ba98fedc; + u32 t[4]; - u8 x_char[16] = { x[0].s3, x[0].s2, x[0].s1, x[0].s0, x[0].s7, x[0].s6, x[0].s5, x[0].s4, x[0].sb, x[0].sa, x[0].s9, x[0].s8, x[0].sf, x[0].se, x[0].sd, x[0].sc }; + t[0] = y[0]; + t[1] = y[1]; + t[2] = y[2]; + t[3] = y[3]; - #ifndef AES_GCM_ALT1 - u8 *v_char = (u8 *) &v; - #endif - - u32 *i_char = (u32 *) &v; - - u8 t = 0; - - for (i = 0; i < 16; i++) + for (int i = 0; i < 4; i++) { - for (j = 0; j < 8; j++) + const u32 tv = x[i]; + + for (int j = 0; j < 32; j++) { - if (x_char[i] & 1 << (7 - j)) + if ((tv >> (31 - j)) & 1) { - z[0] ^= v; + z[0] ^= t[0]; + z[1] ^= t[1]; + z[2] ^= t[2]; + z[3] ^= t[3]; } - t = v.sf & 0x01; + const int m = t[3] & 1; // save lost bit - #ifndef AES_GCM_ALT1 + t[3] = (t[2] << 31) | (t[3] >> 1); + t[2] = (t[1] << 31) | (t[2] >> 1); + t[1] = (t[0] << 31) | (t[1] >> 1); + t[0] = 0 | (t[0] >> 1); - AES_GCM_shift_right_block(v_char); - - #else - - i_char[0] = hc_swap32_S (i_char[0]); - i_char[1] = hc_swap32_S (i_char[1]); - i_char[2] = hc_swap32_S (i_char[2]); - i_char[3] = hc_swap32_S (i_char[3]); - - i_char[3] = (i_char[3] >> 1) | (i_char[2] << 31); - i_char[2] = (i_char[2] >> 1) | (i_char[1] << 31); - i_char[1] = (i_char[1] >> 1) | (i_char[0] << 31); - i_char[0] >>= 1; - - i_char[0] = hc_swap32_S (i_char[0]); - i_char[1] = hc_swap32_S (i_char[1]); - i_char[2] = hc_swap32_S (i_char[2]); - i_char[3] = hc_swap32_S (i_char[3]); - - #endif // AES_GCM_ALT1 - - if (t) - { - v.s0 ^= 0xe1; - } + t[0] ^= m * 0xe1000000; } } } @@ -126,12 +77,7 @@ DECLSPEC void AES_GCM_ghash (const u32 *subkey, const u32 *in, u32 in_len, u32 * xpos += 4; - AES_GCM_gf_mult ((uchar16 *) out, (uchar16 *) subkey, (uchar16 *) tmp); - - tmp[0] = hc_swap32_S (tmp[0]); - tmp[1] = hc_swap32_S (tmp[1]); - tmp[2] = hc_swap32_S (tmp[2]); - tmp[3] = hc_swap32_S (tmp[3]); + AES_GCM_gf_mult (out, subkey, tmp); out[0] = tmp[0]; out[1] = tmp[1]; @@ -155,7 +101,12 @@ DECLSPEC void AES_GCM_ghash (const u32 *subkey, const u32 *in, u32 in_len, u32 * AES_GCM_xor_block (out, tmp); - AES_GCM_gf_mult ((uchar16 *) out, (uchar16 *) subkey, (uchar16 *) tmp); + AES_GCM_gf_mult (out, subkey, tmp); + + tmp[0] = hc_swap32_S (tmp[0]); + tmp[1] = hc_swap32_S (tmp[1]); + tmp[2] = hc_swap32_S (tmp[2]); + tmp[3] = hc_swap32_S (tmp[3]); out[0] = tmp[0]; out[1] = tmp[1]; @@ -202,8 +153,11 @@ DECLSPEC void AES_GCM_Prepare_J0 (const u32 *iv, u32 iv_len, const u32 *subkey, J0[2] = iv[2]; J0[3] = iv[3]; - u32 len_buf[4] = { 0 }; + u32 len_buf[4]; + len_buf[0] = 0; + len_buf[1] = 0; + len_buf[2] = 0; len_buf[3] = iv_len * 8; AES_GCM_ghash (subkey, len_buf, 16, J0); @@ -213,11 +167,17 @@ DECLSPEC void AES_GCM_Prepare_J0 (const u32 *iv, u32 iv_len, const u32 *subkey, DECLSPEC void AES_GCM_gctr (const u32 *key, const u32 *iv, const u32 *in, u32 in_len, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) { const u32 *xpos = in; + u32 *ypos = out; - u32 n = in_len / 16; + u32 iv_buf[4]; - u32 iv_buf[4] = { iv[0], iv[1], iv[2], iv[3] }; + iv_buf[0] = iv[0]; + iv_buf[1] = iv[1]; + iv_buf[2] = iv[2]; + iv_buf[3] = iv[3]; + + const u32 n = in_len / 16; for (u32 i = 0; i < n; i++) { @@ -247,20 +207,18 @@ DECLSPEC void AES_GCM_gctr (const u32 *key, const u32 *iv, const u32 *in, u32 in DECLSPEC void AES_GCM_GCTR (u32 *key, u32 *J0, u32 *in, u32 in_len, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) { - u32 J0_incr[4] = { - J0[0], - J0[1], - J0[2], - J0[3], - }; + u32 J0_incr[4]; + + J0_incr[0] = J0[0]; + J0_incr[1] = J0[1]; + J0_incr[2] = J0[2]; + J0_incr[3] = J0[3]; AES_GCM_gctr (key, J0_incr, in, in_len, out, s_te0, s_te1, s_te2, s_te3, s_te4); } DECLSPEC void AES_GCM_GHASH (const u32 *subkey, const u32 *aad_buf, u32 aad_len, u32 *enc_buf, u32 enc_len, u32 *out) { - u32 len_buf[4] = { 0 }; - out[0] = 0; out[1] = 0; out[2] = 0; @@ -283,7 +241,11 @@ DECLSPEC void AES_GCM_GHASH (const u32 *subkey, const u32 *aad_buf, u32 aad_len, out[2] = hc_swap32_S (out[2]); out[3] = hc_swap32_S (out[3]); + u32 len_buf[4]; + len_buf[0] = aad_len * 8; + len_buf[1] = 0; + len_buf[2] = 0; len_buf[3] = enc_len * 8; AES_GCM_ghash (subkey, len_buf, 16, out); diff --git a/OpenCL/inc_cipher_aes-gcm.h b/OpenCL/inc_cipher_aes-gcm.h index 33e43ed12..ba44729e4 100644 --- a/OpenCL/inc_cipher_aes-gcm.h +++ b/OpenCL/inc_cipher_aes-gcm.h @@ -6,13 +6,9 @@ #ifndef _INC_CIPHER_AES_GCM_H #define _INC_CIPHER_AES_GCM_H -#ifndef AES_GCM_ALT1 -DECLSPEC void AES_GCM_shift_right_block(uchar *block); -#endif - DECLSPEC void AES_GCM_inc32 (u32 *block); DECLSPEC void AES_GCM_xor_block (u32 *dst, const u32 *src); -DECLSPEC void AES_GCM_gf_mult (const uchar16 *x, const uchar16 *y, uchar16 *z); +DECLSPEC void AES_GCM_gf_mult (const u32 *x, const u32 *y, u32 *z); DECLSPEC void AES_GCM_ghash (const u32 *subkey, const u32 *in, u32 in_len, u32 *out); DECLSPEC void AES_GCM_Init (const u32 *ukey, u32 key_len, u32 *key, u32 *subkey, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4); DECLSPEC void AES_GCM_Prepare_J0 (const u32 *iv, u32 iv_len, const u32 *subkey, u32 *J0); diff --git a/OpenCL/m27000-optimized.cl b/OpenCL/m25500-optimized.cl similarity index 89% rename from OpenCL/m27000-optimized.cl rename to OpenCL/m25500-optimized.cl index 53cde203f..fde776d58 100644 --- a/OpenCL/m27000-optimized.cl +++ b/OpenCL/m25500-optimized.cl @@ -82,7 +82,7 @@ DECLSPEC void hmac_sha256_run_V (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *i sha256_transform_vector (w0, w1, w2, w3, digest); } -KERNEL_FQ void m27000_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) +KERNEL_FQ void m25500_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) { /** * base @@ -166,7 +166,7 @@ KERNEL_FQ void m27000_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh } } -KERNEL_FQ void m27000_loop (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) +KERNEL_FQ void m25500_loop (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) { const u64 gid = get_global_id (0); @@ -272,15 +272,11 @@ KERNEL_FQ void m27000_loop (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh } } -KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) +KERNEL_FQ void m25500_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) { - /** - * base - */ - const u64 gid = get_global_id (0); - - if (gid >= gid_max) return; + const u64 lid = get_local_id (0); + const u64 lsz = get_local_size (0); /** * aes shared @@ -288,9 +284,6 @@ KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh #ifdef REAL_SHM - const u64 lid = get_local_id (0); - const u64 lsz = get_local_size (0); - LOCAL_VK u32 s_te0[256]; LOCAL_VK u32 s_te1[256]; LOCAL_VK u32 s_te2[256]; @@ -387,36 +380,21 @@ KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh S[2] ^= enc[2]; S[3] ^= enc[3]; - AES_GCM_gf_mult ((uchar16 *) S, (uchar16 *) subKey, (uchar16 *) t); - - t[0] = hc_swap32_S (t[0]); - t[1] = hc_swap32_S (t[1]); - t[2] = hc_swap32_S (t[2]); - t[3] = hc_swap32_S (t[3]); + AES_GCM_gf_mult (S, subKey, t); S[0] = t[0] ^ enc[4]; S[1] = t[1] ^ enc[5]; S[2] = t[2] ^ enc[6]; S[3] = t[3] ^ enc[7]; - AES_GCM_gf_mult ((uchar16 *) S, (uchar16 *) subKey, (uchar16 *) t); - - t[0] = hc_swap32_S (t[0]); - t[1] = hc_swap32_S (t[1]); - t[2] = hc_swap32_S (t[2]); - t[3] = hc_swap32_S (t[3]); + AES_GCM_gf_mult (S, subKey, t); S[0] = t[0] ^ enc[8]; S[1] = t[1] ^ enc[9]; S[2] = t[2] ^ enc[10]; S[3] = t[3] ^ enc[11]; - AES_GCM_gf_mult ((uchar16 *) S, (uchar16 *) subKey, (uchar16 *) t); - - t[0] = hc_swap32_S (t[0]); - t[1] = hc_swap32_S (t[1]); - t[2] = hc_swap32_S (t[2]); - t[3] = hc_swap32_S (t[3]); + AES_GCM_gf_mult (S, subKey, t); S[0] = t[0]; S[1] = t[1]; @@ -433,12 +411,12 @@ KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh S[2] ^= t[2]; S[3] ^= t[3]; - AES_GCM_gf_mult ((uchar16 *) S, (uchar16 *) subKey, (uchar16 *) t); + AES_GCM_gf_mult (S, subKey, t); - S[0] = hc_swap32_S (t[0]); - S[1] = hc_swap32_S (t[1]); - S[2] = hc_swap32_S (t[2]); - S[3] = hc_swap32_S (t[3]); + S[0] = t[0]; + S[1] = t[1]; + S[2] = t[2]; + S[3] = t[3]; u32 len_buf[4] = { 0 }; @@ -450,12 +428,12 @@ KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh S[2] ^= len_buf[2]; S[3] ^= len_buf[3]; - AES_GCM_gf_mult ((uchar16 *) S, (uchar16 *) subKey, (uchar16 *) t); + AES_GCM_gf_mult (S, subKey, t); - S[0] = hc_swap32_S (t[0]); - S[1] = hc_swap32_S (t[1]); - S[2] = hc_swap32_S (t[2]); - S[3] = hc_swap32_S (t[3]); + S[0] = t[0]; + S[1] = t[1]; + S[2] = t[2]; + S[3] = t[3]; J0[3] = 0x00000001; diff --git a/OpenCL/m27000-pure.cl b/OpenCL/m25500-pure.cl similarity index 97% rename from OpenCL/m27000-pure.cl rename to OpenCL/m25500-pure.cl index 30151a0dc..3bb9c3af6 100644 --- a/OpenCL/m27000-pure.cl +++ b/OpenCL/m25500-pure.cl @@ -82,7 +82,7 @@ DECLSPEC void hmac_sha256_run_V (u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *i sha256_transform_vector (w0, w1, w2, w3, digest); } -KERNEL_FQ void m27000_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) +KERNEL_FQ void m25500_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) { /** * base @@ -166,7 +166,7 @@ KERNEL_FQ void m27000_init (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh } } -KERNEL_FQ void m27000_loop (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) +KERNEL_FQ void m25500_loop (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) { const u64 gid = get_global_id (0); @@ -272,15 +272,11 @@ KERNEL_FQ void m27000_loop (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh } } -KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) +KERNEL_FQ void m25500_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sha256_aes_gcm_t)) { - /** - * base - */ - const u64 gid = get_global_id (0); - - if (gid >= gid_max) return; + const u64 lid = get_local_id (0); + const u64 lsz = get_local_size (0); /** * aes shared @@ -288,9 +284,6 @@ KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh #ifdef REAL_SHM - const u64 lid = get_local_id (0); - const u64 lsz = get_local_size (0); - LOCAL_VK u32 s_te0[256]; LOCAL_VK u32 s_te1[256]; LOCAL_VK u32 s_te2[256]; @@ -376,12 +369,12 @@ KERNEL_FQ void m27000_comp (KERN_ATTR_TMPS_ESALT (pbkdf2_sha256_tmp_t, pbkdf2_sh u32 enc_len = esalt_bufs[DIGESTS_OFFSET].ct_len; -/* + /* // decrypt buffer is not usefull here, skip u32 dec[14] = { 0 }; AES_GCM_GCTR (key, J0, enc, enc_len, dec, s_te0, s_te1, s_te2, s_te3, s_te4); -*/ + */ u32 T[4] = { 0 }; u32 S[4] = { 0 }; diff --git a/docs/changes.txt b/docs/changes.txt index 80c04bcab..6aea41181 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -16,10 +16,10 @@ - Added hash-mode: RAR3-p (Uncompressed) - Added hash-mode: RSA/DSA/EC/OPENSSH Private Keys - Added hash-mode: SQLCipher +- Added hash-mode: Stargazer Stellar Wallet XLM - Added hash-mode: Stuffit5 - Added hash-mode: Umbraco HMAC-SHA1 - Added hash-mode: sha1(sha1($pass).$salt) -- Added hash-mode: Stargazer Stellar Wallet XLM, PBKDF2-HMAC-SHA256 + AES-256-GCM ## ## Features diff --git a/docs/readme.txt b/docs/readme.txt index a482e28b8..d9809a5f8 100644 --- a/docs/readme.txt +++ b/docs/readme.txt @@ -292,6 +292,7 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or - Blockchain, My Wallet - Blockchain, My Wallet, V2 - Blockchain, My Wallet, Second Password (SHA256) +- Stargazer Stellar Wallet XLM - Ethereum Pre-Sale Wallet, PBKDF2-HMAC-SHA256 - Ethereum Wallet, PBKDF2-HMAC-SHA256 - Ethereum Wallet, SCRYPT @@ -340,7 +341,6 @@ NVIDIA GPUs require "NVIDIA Driver" (440.64 or later) and "CUDA Toolkit" (9.0 or - Django (SHA-1) - Web2py pbkdf2-sha512 - TOTP (HMAC-SHA1) -- Stargazer Stellar Wallet XLM, PBKDF2-HMAC-SHA256 + AES-256-GCM - Dahua Authentication MD5 ## diff --git a/src/modules/module_27000.c b/src/modules/module_25500.c similarity index 97% rename from src/modules/module_27000.c rename to src/modules/module_25500.c index 66b213283..de4fb8e92 100644 --- a/src/modules/module_27000.c +++ b/src/modules/module_25500.c @@ -18,8 +18,8 @@ static const u32 DGST_POS2 = 2; static const u32 DGST_POS3 = 3; static const u32 DGST_SIZE = DGST_SIZE_4_4; static const u32 HASH_CATEGORY = HASH_CATEGORY_PASSWORD_MANAGER; -static const char *HASH_NAME = "Stargazer Stellar Wallet XLM, PBKDF2-HMAC-SHA256 + AES-256-GCM"; -static const u64 KERN_TYPE = 27000; +static const char *HASH_NAME = "Stargazer Stellar Wallet XLM"; +static const u64 KERN_TYPE = 25500; static const u32 OPTI_TYPE = OPTI_TYPE_ZERO_BYTE | OPTI_TYPE_SLOW_HASH_SIMD_LOOP; static const u64 OPTS_TYPE = OPTS_TYPE_PT_GENERATE_LE @@ -76,13 +76,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY return jit_build_options; } - // NVIDIA GPU - if (device_param->opencl_device_vendor_id == VENDOR_ID_NV) - { - // aes expandkey produce wrong results with this kernel if REAL_SHM is enabled - hc_asprintf (&jit_build_options, "-D _unroll -D FORCE_DISABLE_SHM"); - } - // ROCM if ((device_param->opencl_device_vendor_id == VENDOR_ID_AMD) && (device_param->has_vperm == true)) {