From ebc8acca1fc3432b5d75283a143f0fd4976e5295 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Thu, 14 Apr 2016 15:31:07 +0200 Subject: [PATCH] Cleanup -m 77xx kernels to latest standard --- OpenCL/m07700_a0.cl | 163 ++++++++--------------- OpenCL/m07700_a1.cl | 305 ++++++++++++++++++++------------------------ OpenCL/m07700_a3.cl | 65 ++++------ 3 files changed, 214 insertions(+), 319 deletions(-) diff --git a/OpenCL/m07700_a0.cl b/OpenCL/m07700_a0.cl index 237a9d406..5b1ec3502 100644 --- a/OpenCL/m07700_a0.cl +++ b/OpenCL/m07700_a0.cl @@ -5,6 +5,9 @@ #define _SAPB_ +//too much register pressure +//#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -18,9 +21,7 @@ #include "OpenCL/common.c" #include "include/rp_kernel.h" #include "OpenCL/rp.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -235,18 +236,16 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; u32 pw_buf0[4]; - - pw_buf0[0] = pws[gid].i[ 0]; - pw_buf0[1] = pws[gid].i[ 1]; - pw_buf0[2] = 0; - pw_buf0[3] = 0; - u32 pw_buf1[4]; - pw_buf1[0] = 0; - pw_buf1[1] = 0; - pw_buf1[2] = 0; - pw_buf1[3] = 0; + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; const u32 pw_len = pws[gid].pw_len; @@ -271,40 +270,21 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - u32 w0[4]; - - w0[0] = pw_buf0[0]; - w0[1] = pw_buf0[1]; - w0[2] = 0; - w0[3] = 0; - - u32 w1[4]; - - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; - u32 w2[4]; - - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); if (out_len > 8) continue; // otherwise it overflows in waldorf function + /** + * SAP + */ + w0[0] = sapb_trans (w0[0]); w0[1] = sapb_trans (w0[1]); @@ -313,28 +293,22 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = 0; - - u32 s1[4]; - s1[0] = 0; s1[1] = 0; s1[2] = 0; s1[3] = 0; - - u32 s2[4]; - s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - - u32 s3[4]; - s3[0] = 0; s3[1] = 0; s3[2] = 0; @@ -533,13 +507,10 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, a ^= c; b ^= d; + c = 0; + d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } @@ -568,18 +539,16 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; u32 pw_buf0[4]; - - pw_buf0[0] = pws[gid].i[ 0]; - pw_buf0[1] = pws[gid].i[ 1]; - pw_buf0[2] = 0; - pw_buf0[3] = 0; - u32 pw_buf1[4]; - pw_buf1[0] = 0; - pw_buf1[1] = 0; - pw_buf1[2] = 0; - pw_buf1[3] = 0; + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; const u32 pw_len = pws[gid].pw_len; @@ -616,40 +585,21 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - u32 w0[4]; - - w0[0] = pw_buf0[0]; - w0[1] = pw_buf0[1]; - w0[2] = 0; - w0[3] = 0; - - u32 w1[4]; - - w1[0] = 0; - w1[1] = 0; - w1[2] = 0; - w1[3] = 0; - - u32 w2[4]; + u32x w0[4] = { 0 }; + u32x w1[4] = { 0 }; + u32x w2[4] = { 0 }; + u32x w3[4] = { 0 }; - w2[0] = 0; - w2[1] = 0; - w2[2] = 0; - w2[3] = 0; - - u32 w3[4]; - - w3[0] = 0; - w3[1] = 0; - w3[2] = 0; - w3[3] = 0; - - const u32 out_len = apply_rules (rules_buf[il_pos].cmds, w0, w1, pw_len); + const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1); if (out_len > 8) continue; // otherwise it overflows in waldorf function + /** + * SAP + */ + w0[0] = sapb_trans (w0[0]); w0[1] = sapb_trans (w0[1]); @@ -658,28 +608,22 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, */ u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = 0; - - u32 s1[4]; - s1[0] = 0; s1[1] = 0; s1[2] = 0; s1[3] = 0; - - u32 s2[4]; - s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - - u32 s3[4]; - s3[0] = 0; s3[1] = 0; s3[2] = 0; @@ -878,13 +822,10 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, a ^= c; b ^= d; + c = 0; + d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } diff --git a/OpenCL/m07700_a1.cl b/OpenCL/m07700_a1.cl index ef82887c2..dd14ce7b5 100644 --- a/OpenCL/m07700_a1.cl +++ b/OpenCL/m07700_a1.cl @@ -5,6 +5,9 @@ #define _SAPB_ +//too much register pressure +//#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +19,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -232,41 +233,20 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 wordl0[4]; - - wordl0[0] = pws[gid].i[ 0]; - wordl0[1] = pws[gid].i[ 1]; - wordl0[2] = 0; - wordl0[3] = 0; - - u32 wordl1[4]; - - wordl1[0] = 0; - wordl1[1] = 0; - wordl1[2] = 0; - wordl1[3] = 0; - - u32 wordl2[4]; + u32 pw_buf0[4]; + u32 pw_buf1[4]; - wordl2[0] = 0; - wordl2[1] = 0; - wordl2[2] = 0; - wordl2[3] = 0; - - u32 wordl3[4]; - - wordl3[0] = 0; - wordl3[1] = 0; - wordl3[2] = 0; - wordl3[3] = 0; + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; const u32 pw_l_len = pws[gid].pw_len; - if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) - { - switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); - } - /** * salt */ @@ -284,87 +264,92 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, salt_buf0[1] = sapb_trans (salt_buf0[1]); salt_buf0[2] = sapb_trans (salt_buf0[2]); - /** - * digest - */ - /** * loop */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 pw_r_len = combs_buf[il_pos].pw_len; - - const u32 pw_len = pw_l_len + pw_r_len; - - u32 wordr0[4]; - - wordr0[0] = combs_buf[il_pos].i[0]; - wordr0[1] = combs_buf[il_pos].i[1]; - wordr0[2] = 0; - wordr0[3] = 0; - - u32 wordr1[4]; - - wordr1[0] = 0; - wordr1[1] = 0; - wordr1[2] = 0; - wordr1[3] = 0; - - u32 wordr2[4]; + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - wordr2[0] = 0; - wordr2[1] = 0; - wordr2[2] = 0; - wordr2[3] = 0; + const u32x pw_len = pw_l_len + pw_r_len; - u32 wordr3[4]; + /** + * concat password candidate + */ - wordr3[0] = 0; - wordr3[1] = 0; - wordr3[2] = 0; - wordr3[3] = 0; + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); + } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); } - u32 w0[4]; + u32x w0[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + + if (pw_len > 8) continue; // otherwise it overflows in waldorf function + + /** + * SAP + */ - w0[0] = sapb_trans (wordl0[0] | wordr0[0]); - w0[1] = sapb_trans (wordl0[1] | wordr0[1]); - w0[2] = 0; - w0[3] = 0; + w0[0] = sapb_trans (w0[0]); + w0[1] = sapb_trans (w0[1]); /** * append salt */ u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = 0; - - u32 s1[4]; - s1[0] = 0; s1[1] = 0; s1[2] = 0; s1[3] = 0; - - u32 s2[4]; - s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - - u32 s3[4]; - s3[0] = 0; s3[1] = 0; s3[2] = 0; @@ -563,13 +548,10 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, a ^= c; b ^= d; + c = 0; + d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } @@ -597,41 +579,20 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (gid >= gid_max) return; - u32 wordl0[4]; - - wordl0[0] = pws[gid].i[ 0]; - wordl0[1] = pws[gid].i[ 1]; - wordl0[2] = 0; - wordl0[3] = 0; - - u32 wordl1[4]; - - wordl1[0] = 0; - wordl1[1] = 0; - wordl1[2] = 0; - wordl1[3] = 0; + u32 pw_buf0[4]; + u32 pw_buf1[4]; - u32 wordl2[4]; - - wordl2[0] = 0; - wordl2[1] = 0; - wordl2[2] = 0; - wordl2[3] = 0; - - u32 wordl3[4]; - - wordl3[0] = 0; - wordl3[1] = 0; - wordl3[2] = 0; - wordl3[3] = 0; + pw_buf0[0] = pws[gid].i[0]; + pw_buf0[1] = pws[gid].i[1]; + pw_buf0[2] = pws[gid].i[2]; + pw_buf0[3] = pws[gid].i[3]; + pw_buf1[0] = pws[gid].i[4]; + pw_buf1[1] = pws[gid].i[5]; + pw_buf1[2] = pws[gid].i[6]; + pw_buf1[3] = pws[gid].i[7]; const u32 pw_l_len = pws[gid].pw_len; - if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) - { - switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); - } - /** * salt */ @@ -665,79 +626,88 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * loop */ - for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 pw_r_len = combs_buf[il_pos].pw_len; - - const u32 pw_len = pw_l_len + pw_r_len; - - u32 wordr0[4]; - - wordr0[0] = combs_buf[il_pos].i[0]; - wordr0[1] = combs_buf[il_pos].i[1]; - wordr0[2] = 0; - wordr0[3] = 0; - - u32 wordr1[4]; + const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos); - wordr1[0] = 0; - wordr1[1] = 0; - wordr1[2] = 0; - wordr1[3] = 0; + const u32x pw_len = pw_l_len + pw_r_len; - u32 wordr2[4]; - - wordr2[0] = 0; - wordr2[1] = 0; - wordr2[2] = 0; - wordr2[3] = 0; - - u32 wordr3[4]; + /** + * concat password candidate + */ - wordr3[0] = 0; - wordr3[1] = 0; - wordr3[2] = 0; - wordr3[3] = 0; + u32x wordl0[4] = { 0 }; + u32x wordl1[4] = { 0 }; + u32x wordl2[4] = { 0 }; + u32x wordl3[4] = { 0 }; + + wordl0[0] = pw_buf0[0]; + wordl0[1] = pw_buf0[1]; + wordl0[2] = pw_buf0[2]; + wordl0[3] = pw_buf0[3]; + wordl1[0] = pw_buf1[0]; + wordl1[1] = pw_buf1[1]; + wordl1[2] = pw_buf1[2]; + wordl1[3] = pw_buf1[3]; + + u32x wordr0[4] = { 0 }; + u32x wordr1[4] = { 0 }; + u32x wordr2[4] = { 0 }; + u32x wordr3[4] = { 0 }; + + wordr0[0] = ix_create_combt (combs_buf, il_pos, 0); + wordr0[1] = ix_create_combt (combs_buf, il_pos, 1); + wordr0[2] = ix_create_combt (combs_buf, il_pos, 2); + wordr0[3] = ix_create_combt (combs_buf, il_pos, 3); + wordr1[0] = ix_create_combt (combs_buf, il_pos, 4); + wordr1[1] = ix_create_combt (combs_buf, il_pos, 5); + wordr1[2] = ix_create_combt (combs_buf, il_pos, 6); + wordr1[3] = ix_create_combt (combs_buf, il_pos, 7); if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le_VV (wordr0, wordr1, wordr2, wordr3, pw_l_len); } + else + { + switch_buffer_by_offset_le_VV (wordl0, wordl1, wordl2, wordl3, pw_r_len); + } + + u32x w0[4]; + + w0[0] = wordl0[0] | wordr0[0]; + w0[1] = wordl0[1] | wordr0[1]; + + if (pw_len > 8) continue; // otherwise it overflows in waldorf function - u32 w0[4]; + /** + * SAP + */ - w0[0] = sapb_trans (wordl0[0] | wordr0[0]); - w0[1] = sapb_trans (wordl0[1] | wordr0[1]); - w0[2] = 0; - w0[3] = 0; + w0[0] = sapb_trans (w0[0]); + w0[1] = sapb_trans (w0[1]); /** * append salt */ u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = 0; - - u32 s1[4]; - s1[0] = 0; s1[1] = 0; s1[2] = 0; s1[3] = 0; - - u32 s2[4]; - s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - - u32 s3[4]; - s3[0] = 0; s3[1] = 0; s3[2] = 0; @@ -936,13 +906,10 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, a ^= c; b ^= d; + c = 0; + d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } diff --git a/OpenCL/m07700_a3.cl b/OpenCL/m07700_a3.cl index 212ed1a1d..e3dbff060 100644 --- a/OpenCL/m07700_a3.cl +++ b/OpenCL/m07700_a3.cl @@ -5,6 +5,9 @@ #define _SAPB_ +//too much register pressure +//#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +19,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define GETCHAR(a,p) (((a)[(p) / 4] >> (((p) & 3) * 8)) & 0xff) #define PUTCHAR(a,p,c) ((a)[(p) / 4] = (((a)[(p) / 4] & ~(0xff << (((p) & 3) * 8))) | ((c) << (((p) & 3) * 8)))) @@ -245,28 +246,22 @@ static void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 salt_len = salt_bufs[salt_pos].salt_len; u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = 0; - - u32 s1[4]; - s1[0] = 0; s1[1] = 0; s1[2] = 0; s1[3] = 0; - - u32 s2[4]; - s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - - u32 s3[4]; - s3[0] = 0; s3[1] = 0; s3[2] = 0; @@ -282,11 +277,13 @@ static void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 w0r = sapb_trans (bfs_buf[il_pos].i); + const u32x w0r = sapb_trans (ix_create_bft (bfs_buf, il_pos)); + + const u32x w0lr = w0l | w0r; - w0[0] = w0l | w0r; + w0[0] = w0lr; u32 t[16]; @@ -477,13 +474,10 @@ static void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le a ^= c; b ^= d; + c = 0; + d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } @@ -516,28 +510,22 @@ static void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 salt_len = salt_bufs[salt_pos].salt_len; u32 s0[4]; + u32 s1[4]; + u32 s2[4]; + u32 s3[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = 0; - - u32 s1[4]; - s1[0] = 0; s1[1] = 0; s1[2] = 0; s1[3] = 0; - - u32 s2[4]; - s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - - u32 s3[4]; - s3[0] = 0; s3[1] = 0; s3[2] = 0; @@ -565,11 +553,13 @@ static void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < il_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE) { - const u32 w0r = sapb_trans (bfs_buf[il_pos].i); + const u32x w0r = sapb_trans (ix_create_bft (bfs_buf, il_pos)); + + const u32x w0lr = w0l | w0r; - w0[0] = w0l | w0r; + w0[0] = w0lr; u32 t[16]; @@ -760,13 +750,10 @@ static void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le a ^= c; b ^= d; + c = 0; + d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } }