/** * Author......: See docs/credits.txt * License.....: MIT */ #define NEW_SIMD_CODE #ifdef KERNEL_STATIC #include "inc_vendor.h" #include "inc_types.h" #include "inc_platform.cl" #include "inc_common.cl" #include "inc_simd.cl" #include "inc_hash_sha1.cl" #endif CONSTANT_VK u32 bin2base64[0x40] = { 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2b, 0x2f, }; #if VECT_SIZE == 1 #define int_to_base64(c) make_u32x (s_bin2base64[(c)]) #elif VECT_SIZE == 2 #define int_to_base64(c) make_u32x (s_bin2base64[(c).s0], s_bin2base64[(c).s1]) #elif VECT_SIZE == 4 #define int_to_base64(c) make_u32x (s_bin2base64[(c).s0], s_bin2base64[(c).s1], s_bin2base64[(c).s2], s_bin2base64[(c).s3]) #elif VECT_SIZE == 8 #define int_to_base64(c) make_u32x (s_bin2base64[(c).s0], s_bin2base64[(c).s1], s_bin2base64[(c).s2], s_bin2base64[(c).s3], s_bin2base64[(c).s4], s_bin2base64[(c).s5], s_bin2base64[(c).s6], s_bin2base64[(c).s7]) #elif VECT_SIZE == 16 #define int_to_base64(c) make_u32x (s_bin2base64[(c).s0], s_bin2base64[(c).s1], s_bin2base64[(c).s2], s_bin2base64[(c).s3], s_bin2base64[(c).s4], s_bin2base64[(c).s5], s_bin2base64[(c).s6], s_bin2base64[(c).s7], s_bin2base64[(c).s8], s_bin2base64[(c).s9], s_bin2base64[(c).sa], s_bin2base64[(c).sb], s_bin2base64[(c).sc], s_bin2base64[(c).sd], s_bin2base64[(c).se], s_bin2base64[(c).sf]) #endif KERNEL_FQ void m28300_mxx (KERN_ATTR_VECTOR ()) { /** * base */ const u64 gid = get_global_id (0); const u64 lid = get_local_id (0); const u64 lsz = get_local_size (0); /** * sbox */ #ifdef REAL_SHM LOCAL_VK u32 s_bin2base64[0x40]; for (u32 i = lid; i < 0x40; i += lsz) { s_bin2base64[i] = bin2base64[i]; } SYNC_THREADS (); #else CONSTANT_AS u32a *s_bin2base64 = bin2base64; #endif if (gid >= gid_max) return; /** * base */ const u32 pw_len = pws[gid].pw_len; u32x w[64] = { 0 }; for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) { w[idx] = pws[gid].i[idx]; } const u32 salt_len = salt_bufs[SALT_POS].salt_len; u32x s[64] = { 0 }; for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1) { s[idx] = salt_bufs[SALT_POS].salt_buf[idx]; } /** * loop */ u32x w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; const u32x w0 = w0l | w0r; w[0] = w0; sha1_ctx_vector_t ctx; sha1_init_vector (&ctx); sha1_update_vector (&ctx, w, pw_len); sha1_final_vector (&ctx); u32x h[5]; h[0] = ctx.h[0]; h[1] = ctx.h[1]; h[2] = ctx.h[2]; h[3] = ctx.h[3]; h[4] = ctx.h[4]; #define tmp_u8_00 ((h[0] >> 26) & 0x3f) #define tmp_u8_01 ((h[0] >> 20) & 0x3f) #define tmp_u8_02 ((h[0] >> 14) & 0x3f) #define tmp_u8_03 ((h[0] >> 8) & 0x3f) #define tmp_u8_04 ((h[0] >> 2) & 0x3f) #define tmp_u8_05 ((h[0] << 4) & 0x3c) | ((h[1] >> 28) & 0x0f) #define tmp_u8_06 ((h[1] >> 22) & 0x3f) #define tmp_u8_07 ((h[1] >> 16) & 0x3f) #define tmp_u8_08 ((h[1] >> 10) & 0x3f) #define tmp_u8_09 ((h[1] >> 4) & 0x3f) #define tmp_u8_10 ((h[1] << 2) & 0x3c) | ((h[2] >> 30) & 0x03) #define tmp_u8_11 ((h[2] >> 24) & 0x3f) #define tmp_u8_12 ((h[2] >> 18) & 0x3f) #define tmp_u8_13 ((h[2] >> 12) & 0x3f) #define tmp_u8_14 ((h[2] >> 6) & 0x3f) #define tmp_u8_15 ((h[2] >> 0) & 0x3f) #define tmp_u8_16 ((h[3] >> 26) & 0x3f) #define tmp_u8_17 ((h[3] >> 20) & 0x3f) #define tmp_u8_18 ((h[3] >> 14) & 0x3f) #define tmp_u8_19 ((h[3] >> 8) & 0x3f) #define tmp_u8_20 ((h[3] >> 2) & 0x3f) #define tmp_u8_21 ((h[3] << 4) & 0x3c) | ((h[4] >> 28) & 0x0f) #define tmp_u8_22 ((h[4] >> 22) & 0x3f) #define tmp_u8_23 ((h[4] >> 16) & 0x3f) #define tmp_u8_24 ((h[4] >> 10) & 0x3f) #define tmp_u8_25 ((h[4] >> 4) & 0x3f) #define tmp_u8_26 ((h[4] << 2) & 0x3c) sha1_init_vector (&ctx); ctx.w0[0] = int_to_base64 (tmp_u8_00) << 24 | int_to_base64 (tmp_u8_01) << 16 | int_to_base64 (tmp_u8_02) << 8 | int_to_base64 (tmp_u8_03) << 0; ctx.w0[1] = int_to_base64 (tmp_u8_04) << 24 | int_to_base64 (tmp_u8_05) << 16 | int_to_base64 (tmp_u8_06) << 8 | int_to_base64 (tmp_u8_07) << 0; ctx.w0[2] = int_to_base64 (tmp_u8_08) << 24 | int_to_base64 (tmp_u8_09) << 16 | int_to_base64 (tmp_u8_10) << 8 | int_to_base64 (tmp_u8_11) << 0; ctx.w0[3] = int_to_base64 (tmp_u8_12) << 24 | int_to_base64 (tmp_u8_13) << 16 | int_to_base64 (tmp_u8_14) << 8 | int_to_base64 (tmp_u8_15) << 0; ctx.w1[0] = int_to_base64 (tmp_u8_16) << 24 | int_to_base64 (tmp_u8_17) << 16 | int_to_base64 (tmp_u8_18) << 8 | int_to_base64 (tmp_u8_19) << 0; ctx.w1[1] = int_to_base64 (tmp_u8_20) << 24 | int_to_base64 (tmp_u8_21) << 16 | int_to_base64 (tmp_u8_22) << 8 | int_to_base64 (tmp_u8_23) << 0; ctx.w1[2] = int_to_base64 (tmp_u8_24) << 24 | int_to_base64 (tmp_u8_25) << 16 | int_to_base64 (tmp_u8_26) << 8 | '=' << 0; ctx.len = 28; sha1_update_vector (&ctx, s, 152); sha1_final_vector (&ctx); const u32x r0 = ctx.h[DGST_R0]; const u32x r1 = ctx.h[DGST_R1]; const u32x r2 = ctx.h[DGST_R2]; const u32x r3 = ctx.h[DGST_R3]; COMPARE_M_SIMD (r0, r1, r2, r3); } } KERNEL_FQ void m28300_sxx (KERN_ATTR_VECTOR ()) { /** * base */ const u64 gid = get_global_id (0); const u64 lid = get_local_id (0); const u64 lsz = get_local_size (0); /** * sbox */ #ifdef REAL_SHM LOCAL_VK u32 s_bin2base64[0x40]; for (u32 i = lid; i < 0x40; i += lsz) { s_bin2base64[i] = bin2base64[i]; } SYNC_THREADS (); #else CONSTANT_AS u32a *s_bin2base64 = bin2base64; #endif if (gid >= gid_max) return; /** * digest */ const u32 search[4] = { digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0], digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1], digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2], digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3] }; /** * base */ const u32 pw_len = pws[gid].pw_len; u32x w[64] = { 0 }; for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1) { w[idx] = pws[gid].i[idx]; } const u32 salt_len = salt_bufs[SALT_POS].salt_len; u32x s[64] = { 0 }; for (u32 i = 0, idx = 0; i < salt_len; i += 4, idx += 1) { s[idx] = salt_bufs[SALT_POS].salt_buf[idx]; } /** * loop */ u32x w0l = w[0]; for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; const u32x w0 = w0l | w0r; w[0] = w0; sha1_ctx_vector_t ctx; sha1_init_vector (&ctx); sha1_update_vector (&ctx, w, pw_len); sha1_final_vector (&ctx); u32x h[5]; h[0] = ctx.h[0]; h[1] = ctx.h[1]; h[2] = ctx.h[2]; h[3] = ctx.h[3]; h[4] = ctx.h[4]; #define tmp_u8_00 ((h[0] >> 26) & 0x3f) #define tmp_u8_01 ((h[0] >> 20) & 0x3f) #define tmp_u8_02 ((h[0] >> 14) & 0x3f) #define tmp_u8_03 ((h[0] >> 8) & 0x3f) #define tmp_u8_04 ((h[0] >> 2) & 0x3f) #define tmp_u8_05 ((h[0] << 4) & 0x3c) | ((h[1] >> 28) & 0x0f) #define tmp_u8_06 ((h[1] >> 22) & 0x3f) #define tmp_u8_07 ((h[1] >> 16) & 0x3f) #define tmp_u8_08 ((h[1] >> 10) & 0x3f) #define tmp_u8_09 ((h[1] >> 4) & 0x3f) #define tmp_u8_10 ((h[1] << 2) & 0x3c) | ((h[2] >> 30) & 0x03) #define tmp_u8_11 ((h[2] >> 24) & 0x3f) #define tmp_u8_12 ((h[2] >> 18) & 0x3f) #define tmp_u8_13 ((h[2] >> 12) & 0x3f) #define tmp_u8_14 ((h[2] >> 6) & 0x3f) #define tmp_u8_15 ((h[2] >> 0) & 0x3f) #define tmp_u8_16 ((h[3] >> 26) & 0x3f) #define tmp_u8_17 ((h[3] >> 20) & 0x3f) #define tmp_u8_18 ((h[3] >> 14) & 0x3f) #define tmp_u8_19 ((h[3] >> 8) & 0x3f) #define tmp_u8_20 ((h[3] >> 2) & 0x3f) #define tmp_u8_21 ((h[3] << 4) & 0x3c) | ((h[4] >> 28) & 0x0f) #define tmp_u8_22 ((h[4] >> 22) & 0x3f) #define tmp_u8_23 ((h[4] >> 16) & 0x3f) #define tmp_u8_24 ((h[4] >> 10) & 0x3f) #define tmp_u8_25 ((h[4] >> 4) & 0x3f) #define tmp_u8_26 ((h[4] << 2) & 0x3c) sha1_init_vector (&ctx); ctx.w0[0] = int_to_base64 (tmp_u8_00) << 24 | int_to_base64 (tmp_u8_01) << 16 | int_to_base64 (tmp_u8_02) << 8 | int_to_base64 (tmp_u8_03) << 0; ctx.w0[1] = int_to_base64 (tmp_u8_04) << 24 | int_to_base64 (tmp_u8_05) << 16 | int_to_base64 (tmp_u8_06) << 8 | int_to_base64 (tmp_u8_07) << 0; ctx.w0[2] = int_to_base64 (tmp_u8_08) << 24 | int_to_base64 (tmp_u8_09) << 16 | int_to_base64 (tmp_u8_10) << 8 | int_to_base64 (tmp_u8_11) << 0; ctx.w0[3] = int_to_base64 (tmp_u8_12) << 24 | int_to_base64 (tmp_u8_13) << 16 | int_to_base64 (tmp_u8_14) << 8 | int_to_base64 (tmp_u8_15) << 0; ctx.w1[0] = int_to_base64 (tmp_u8_16) << 24 | int_to_base64 (tmp_u8_17) << 16 | int_to_base64 (tmp_u8_18) << 8 | int_to_base64 (tmp_u8_19) << 0; ctx.w1[1] = int_to_base64 (tmp_u8_20) << 24 | int_to_base64 (tmp_u8_21) << 16 | int_to_base64 (tmp_u8_22) << 8 | int_to_base64 (tmp_u8_23) << 0; ctx.w1[2] = int_to_base64 (tmp_u8_24) << 24 | int_to_base64 (tmp_u8_25) << 16 | int_to_base64 (tmp_u8_26) << 8 | '=' << 0; ctx.len = 28; sha1_update_vector (&ctx, s, 152); sha1_final_vector (&ctx); const u32x r0 = ctx.h[DGST_R0]; const u32x r1 = ctx.h[DGST_R1]; const u32x r2 = ctx.h[DGST_R2]; const u32x r3 = ctx.h[DGST_R3]; COMPARE_S_SIMD (r0, r1, r2, r3); } }