Prepare NEW_SIMD_MODE for -a 1 kernels

pull/237/head
Jens Steube 8 years ago
parent 4338d71d62
commit 7190dcf855

@ -5,6 +5,8 @@
#define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h"
#include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#include "OpenCL/simd.c"
__kernel void m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
@ -68,86 +68,72 @@ __kernel void m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
append_0x80_2x4_S (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
* loop
*/
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{
const u32 pw_r_len = combs_buf[il_pos].pw_len;
const u32 pw_len = pw_l_len + pw_r_len;
u32 wordr0[4];
wordr0[0] = combs_buf[il_pos].i[0];
wordr0[1] = combs_buf[il_pos].i[1];
wordr0[2] = combs_buf[il_pos].i[2];
wordr0[3] = combs_buf[il_pos].i[3];
const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 wordr1[4];
const u32x pw_len = pw_l_len + pw_r_len;
wordr1[0] = combs_buf[il_pos].i[4];
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
u32x w0[4];
w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3];
u32 w1[4];
u32x w1[4];
w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3];
u32 w2[4];
u32x w2[4];
w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3];
u32 w3[4];
u32x w3[4];
w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8;
w3[3] = 0;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -217,12 +203,7 @@ __kernel void m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
COMPARE_M_SIMD (a, d, c, b);
}
}
@ -282,9 +263,9 @@ __kernel void m00000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
append_0x80_2x4_S (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/**
@ -303,77 +284,63 @@ __kernel void m00000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop
*/
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{
const u32 pw_r_len = combs_buf[il_pos].pw_len;
const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len;
const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4];
u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0];
wordr0[1] = combs_buf[il_pos].i[1];
wordr0[2] = combs_buf[il_pos].i[2];
wordr0[3] = combs_buf[il_pos].i[3];
u32 wordr1[4];
wordr1[0] = combs_buf[il_pos].i[4];
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
}
u32 w0[4];
u32x w0[4];
w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3];
u32 w1[4];
u32x w1[4];
w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3];
u32 w2[4];
u32x w2[4];
w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3];
u32 w3[4];
u32x w3[4];
w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8;
w3[3] = 0;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -440,20 +407,13 @@ __kernel void m00000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
bool q_cond = allx (search[0] != a);
if (q_cond) continue;
if (MATCHES_NONE_VS (a, search[0])) continue;
MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
COMPARE_S_SIMD (a, d, c, b);
}
}

@ -118,7 +118,7 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -356,7 +356,7 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -118,7 +118,7 @@ static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -356,7 +356,7 @@ static void m00040s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -254,7 +254,7 @@ static void m00050m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -383,7 +383,7 @@ static void m00050s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -275,7 +275,7 @@ static void m00060m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0rl = w0r | w0l;
@ -390,7 +390,7 @@ static void m00060s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0rl = w0r | w0l;

@ -135,7 +135,7 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -425,7 +425,7 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -135,7 +135,7 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -425,7 +425,7 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -274,7 +274,7 @@ static void m00150m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -389,7 +389,7 @@ static void m00150s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -309,7 +309,7 @@ static void m00160m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -422,7 +422,7 @@ static void m00160s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -135,7 +135,7 @@ static void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -385,7 +385,7 @@ static void m01420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -135,7 +135,7 @@ static void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -385,7 +385,7 @@ static void m01440s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -264,7 +264,7 @@ static void m01450m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -379,7 +379,7 @@ static void m01450s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -299,7 +299,7 @@ static void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -412,7 +412,7 @@ static void m01460s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -257,7 +257,7 @@ static void m01720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -457,7 +457,7 @@ static void m01720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -257,7 +257,7 @@ static void m01740m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -457,7 +457,7 @@ static void m01740s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -295,7 +295,7 @@ static void m01750m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -415,7 +415,7 @@ static void m01750s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -330,7 +330,7 @@ static void m01760m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -448,7 +448,7 @@ static void m01760s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -69,7 +69,7 @@ static void m02610m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -329,7 +329,7 @@ static void m02610s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -71,7 +71,7 @@ static void m02710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -416,7 +416,7 @@ static void m02710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -70,7 +70,7 @@ static void m02810m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -414,7 +414,7 @@ static void m02810s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -88,7 +88,7 @@ static void m03710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -396,7 +396,7 @@ static void m03710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -77,7 +77,7 @@ static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -335,7 +335,7 @@ static void m03800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -68,7 +68,7 @@ static void m04310m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -328,7 +328,7 @@ static void m04310s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -55,7 +55,7 @@ static void m04400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -326,7 +326,7 @@ static void m04400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -51,7 +51,7 @@ static void m04500m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -357,7 +357,7 @@ static void m04500s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -52,7 +52,7 @@ static void m04700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -340,7 +340,7 @@ static void m04700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -142,7 +142,7 @@ static void m04800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
w0_t[0] = w0l | (w0r << 8);
w0_t[1] = w1l | (w0r >> 24);
@ -362,7 +362,7 @@ static void m04800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
w0_t[0] = w0l | (w0r << 8);
w0_t[1] = w1l | (w0r >> 24);

@ -129,7 +129,7 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -437,7 +437,7 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -113,7 +113,7 @@ static void m05000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -273,7 +273,7 @@ static void m05000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -43,7 +43,7 @@ static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -171,7 +171,7 @@ static void m05100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -229,7 +229,7 @@ static void m05300m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -394,7 +394,7 @@ static void m05300s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -263,7 +263,7 @@ static void m05400m (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -428,7 +428,7 @@ static void m05400s (__local u32 *w_s, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -313,7 +313,7 @@ static void m05600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -530,7 +530,7 @@ static void m05600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -240,7 +240,7 @@ static void m06000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -312,7 +312,7 @@ static void m06000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -1363,7 +1363,7 @@ static void m06100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -1423,7 +1423,7 @@ static void m06100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -729,7 +729,7 @@ static void m06900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -917,7 +917,7 @@ static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -256,7 +256,7 @@ static void m07300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -380,7 +380,7 @@ static void m07300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -86,7 +86,7 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -594,7 +594,7 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -50,7 +50,7 @@ static void m08100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -238,7 +238,7 @@ static void m08100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -267,7 +267,7 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -521,7 +521,7 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -207,7 +207,7 @@ static void m08400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -435,7 +435,7 @@ static void m08400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -303,7 +303,7 @@ static void m09700m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
@ -806,7 +806,7 @@ static void m09700s (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;

@ -269,7 +269,7 @@ static void m09710m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
@ -395,7 +395,7 @@ static void m09710s (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;

@ -490,7 +490,7 @@ static void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -588,7 +588,7 @@ static void m09720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -314,7 +314,7 @@ static void m09800m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
@ -490,7 +490,7 @@ static void m09800s (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;

@ -299,7 +299,7 @@ static void m09810m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
@ -403,7 +403,7 @@ static void m09810s (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;

@ -180,7 +180,7 @@ static void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -294,7 +294,7 @@ static void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -275,7 +275,7 @@ static void m10400m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
w0[0] = w0l | w0r;
@ -428,7 +428,7 @@ static void m10400s (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
w0[0] = w0l | w0r;

@ -152,7 +152,7 @@ static void m10410m (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;
@ -206,7 +206,7 @@ static void m10410s (__local RC4_KEY *rc4_keys, u32 w0[4], u32 w1[4], u32 w2[4],
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = w0r_create_bft (bfs_buf, il_pos);
const u32 w0r = ix_create_bft (bfs_buf, il_pos);
const u32 w0lr = w0l | w0r;

@ -171,7 +171,7 @@ static void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -312,7 +312,7 @@ static void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -71,7 +71,7 @@ static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -354,7 +354,7 @@ static void m11000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -98,7 +98,7 @@ static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -397,7 +397,7 @@ static void m11100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -49,7 +49,7 @@ static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -492,7 +492,7 @@ static void m11200s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -877,7 +877,7 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -1409,7 +1409,7 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -2022,7 +2022,7 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -2652,7 +2652,7 @@ static void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -3363,7 +3363,7 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -3895,7 +3895,7 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -4508,7 +4508,7 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -5138,7 +5138,7 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -2314,7 +2314,7 @@ static void m11700m (__local u64 (*s_sbob_sl64)[256], u32 w[16], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -2409,7 +2409,7 @@ static void m11700s (__local u64 (*s_sbob_sl64)[256], u32 w[16], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -2314,7 +2314,7 @@ static void m11800m (__local u64 (*s_sbob_sl64)[256], u32 w[16], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -2409,7 +2409,7 @@ static void m11800s (__local u64 (*s_sbob_sl64)[256], u32 w[16], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -67,7 +67,7 @@ static void m12600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;
@ -370,7 +370,7 @@ static void m12600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32x w0r = w0r_create_bft (bfs_buf, il_pos);
const u32x w0r = ix_create_bft (bfs_buf, il_pos);
const u32x w0lr = w0l | w0r;

@ -1171,43 +1171,79 @@
// attack-mode 0
static inline u32x w0r_create_bft (__global bf_t *bfs_buf, const u32 il_pos)
static inline u32x ix_create_bft (__global bf_t *bfs_buf, const u32 il_pos)
{
#if VECT_SIZE == 1
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i);
const u32x ix = (u32x) (bfs_buf[il_pos + 0].i);
#elif VECT_SIZE == 2
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
#elif VECT_SIZE == 4
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
#elif VECT_SIZE == 8
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
#elif VECT_SIZE == 16
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i, bfs_buf[il_pos + 8].i, bfs_buf[il_pos + 9].i, bfs_buf[il_pos + 10].i, bfs_buf[il_pos + 11].i, bfs_buf[il_pos + 12].i, bfs_buf[il_pos + 13].i, bfs_buf[il_pos + 14].i, bfs_buf[il_pos + 15].i);
const u32x ix = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i, bfs_buf[il_pos + 8].i, bfs_buf[il_pos + 9].i, bfs_buf[il_pos + 10].i, bfs_buf[il_pos + 11].i, bfs_buf[il_pos + 12].i, bfs_buf[il_pos + 13].i, bfs_buf[il_pos + 14].i, bfs_buf[il_pos + 15].i);
#endif
return w0r;
return ix;
}
// attack-mode 1
static inline u32x pwlenx_create_combt (__global comb_t *combs_buf, const u32 il_pos)
{
#if VECT_SIZE == 1
const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len);
#elif VECT_SIZE == 2
const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len);
#elif VECT_SIZE == 4
const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len);
#elif VECT_SIZE == 8
const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len, combs_buf[il_pos + 4].pw_len, combs_buf[il_pos + 5].pw_len, combs_buf[il_pos + 6].pw_len, combs_buf[il_pos + 7].pw_len);
#elif VECT_SIZE == 16
const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len, combs_buf[il_pos + 1].pw_len, combs_buf[il_pos + 2].pw_len, combs_buf[il_pos + 3].pw_len, combs_buf[il_pos + 4].pw_len, combs_buf[il_pos + 5].pw_len, combs_buf[il_pos + 6].pw_len, combs_buf[il_pos + 7].pw_len, combs_buf[il_pos + 8].pw_len, combs_buf[il_pos + 9].pw_len, combs_buf[il_pos + 10].pw_len, combs_buf[il_pos + 11].pw_len, combs_buf[il_pos + 12].pw_len, combs_buf[il_pos + 13].pw_len, combs_buf[il_pos + 14].pw_len, combs_buf[il_pos + 15].pw_len);
#endif
return pw_lenx;
}
static inline u32x ix_create_combt (__global comb_t *combs_buf, const u32 il_pos, const int idx)
{
#if VECT_SIZE == 1
const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx]);
#elif VECT_SIZE == 2
const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx]);
#elif VECT_SIZE == 4
const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx]);
#elif VECT_SIZE == 8
const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx], combs_buf[il_pos + 4].i[idx], combs_buf[il_pos + 5].i[idx], combs_buf[il_pos + 6].i[idx], combs_buf[il_pos + 7].i[idx]);
#elif VECT_SIZE == 16
const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx], combs_buf[il_pos + 1].i[idx], combs_buf[il_pos + 2].i[idx], combs_buf[il_pos + 3].i[idx], combs_buf[il_pos + 4].i[idx], combs_buf[il_pos + 5].i[idx], combs_buf[il_pos + 6].i[idx], combs_buf[il_pos + 7].i[idx], combs_buf[il_pos + 8].i[idx], combs_buf[il_pos + 9].i[idx], combs_buf[il_pos + 10].i[idx], combs_buf[il_pos + 11].i[idx], combs_buf[il_pos + 12].i[idx], combs_buf[il_pos + 13].i[idx], combs_buf[il_pos + 14].i[idx], combs_buf[il_pos + 15].i[idx]);
#endif
return ix;
}
#if VECT_SIZE == 1
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 1) + 0].var[(idx)])
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 1) + 0].var[(idx)])
#elif VECT_SIZE == 2
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 2) + 0].var[(idx)], (arr)[((gid) * 2) + 1].var[(idx)])
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 2) + 0].var[(idx)], (arr)[((gid) * 2) + 1].var[(idx)])
#elif VECT_SIZE == 4
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 4) + 0].var[(idx)], (arr)[((gid) * 4) + 1].var[(idx)], (arr)[((gid) * 4) + 2].var[(idx)], (arr)[((gid) * 4) + 3].var[(idx)])
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 4) + 0].var[(idx)], (arr)[((gid) * 4) + 1].var[(idx)], (arr)[((gid) * 4) + 2].var[(idx)], (arr)[((gid) * 4) + 3].var[(idx)])
#elif VECT_SIZE == 8
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)])
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)])
#elif VECT_SIZE == 16
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 8) + 0].var[(idx)], (arr)[((gid) * 8) + 1].var[(idx)], (arr)[((gid) * 8) + 2].var[(idx)], (arr)[((gid) * 8) + 3].var[(idx)], (arr)[((gid) * 8) + 4].var[(idx)], (arr)[((gid) * 8) + 5].var[(idx)], (arr)[((gid) * 8) + 6].var[(idx)], (arr)[((gid) * 8) + 7].var[(idx)], (arr)[((gid) * 8) + 8].var[(idx)], (arr)[((gid) * 8) + 9].var[(idx)], (arr)[((gid) * 8) + 10].var[(idx)], (arr)[((gid) * 8) + 11].var[(idx)], (arr)[((gid) * 8) + 12].var[(idx)], (arr)[((gid) * 8) + 13].var[(idx)], (arr)[((gid) * 8) + 14].var[(idx)], (arr)[((gid) * 8) + 15].var[(idx)])
#define packv(arr,var,gid,idx) (u32x) ((arr)[((gid) * 16) + 0].var[(idx)], (arr)[((gid) * 16) + 1].var[(idx)], (arr)[((gid) * 16) + 2].var[(idx)], (arr)[((gid) * 16) + 3].var[(idx)], (arr)[((gid) * 16) + 4].var[(idx)], (arr)[((gid) * 16) + 5].var[(idx)], (arr)[((gid) * 16) + 6].var[(idx)], (arr)[((gid) * 16) + 7].var[(idx)], (arr)[((gid) * 16) + 8].var[(idx)], (arr)[((gid) * 16) + 9].var[(idx)], (arr)[((gid) * 16) + 10].var[(idx)], (arr)[((gid) * 16) + 11].var[(idx)], (arr)[((gid) * 16) + 12].var[(idx)], (arr)[((gid) * 16) + 13].var[(idx)], (arr)[((gid) * 16) + 14].var[(idx)], (arr)[((gid) * 16) + 15].var[(idx)])
#endif
#if VECT_SIZE == 1
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 1) + 0].var[(idx)] = val;
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 1) + 0].var[(idx)] = val;
#elif VECT_SIZE == 2
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 2) + 0].var[(idx)] = val.s0; (arr)[((gid) * 2) + 1].var[(idx)] = val.s1;
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 2) + 0].var[(idx)] = val.s0; (arr)[((gid) * 2) + 1].var[(idx)] = val.s1;
#elif VECT_SIZE == 4
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 4) + 0].var[(idx)] = val.s0; (arr)[((gid) * 4) + 1].var[(idx)] = val.s1; (arr)[((gid) * 4) + 2].var[(idx)] = val.s2; (arr)[((gid) * 4) + 3].var[(idx)] = val.s3;
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 4) + 0].var[(idx)] = val.s0; (arr)[((gid) * 4) + 1].var[(idx)] = val.s1; (arr)[((gid) * 4) + 2].var[(idx)] = val.s2; (arr)[((gid) * 4) + 3].var[(idx)] = val.s3;
#elif VECT_SIZE == 8
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 8) + 0].var[(idx)] = val.s0; (arr)[((gid) * 8) + 1].var[(idx)] = val.s1; (arr)[((gid) * 8) + 2].var[(idx)] = val.s2; (arr)[((gid) * 8) + 3].var[(idx)] = val.s3; (arr)[((gid) * 8) + 4].var[(idx)] = val.s4; (arr)[((gid) * 8) + 5].var[(idx)] = val.s5; (arr)[((gid) * 8) + 6].var[(idx)] = val.s6; (arr)[((gid) * 8) + 7].var[(idx)] = val.s7;
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 8) + 0].var[(idx)] = val.s0; (arr)[((gid) * 8) + 1].var[(idx)] = val.s1; (arr)[((gid) * 8) + 2].var[(idx)] = val.s2; (arr)[((gid) * 8) + 3].var[(idx)] = val.s3; (arr)[((gid) * 8) + 4].var[(idx)] = val.s4; (arr)[((gid) * 8) + 5].var[(idx)] = val.s5; (arr)[((gid) * 8) + 6].var[(idx)] = val.s6; (arr)[((gid) * 8) + 7].var[(idx)] = val.s7;
#elif VECT_SIZE == 16
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 8) + 0].var[(idx)] = val.s0; (arr)[((gid) * 8) + 1].var[(idx)] = val.s1; (arr)[((gid) * 8) + 2].var[(idx)] = val.s2; (arr)[((gid) * 8) + 3].var[(idx)] = val.s3; (arr)[((gid) * 8) + 4].var[(idx)] = val.s4; (arr)[((gid) * 8) + 5].var[(idx)] = val.s5; (arr)[((gid) * 8) + 6].var[(idx)] = val.s6; (arr)[((gid) * 8) + 7].var[(idx)] = val.s7; (arr)[((gid) * 8) + 8].var[(idx)] = val.s8; (arr)[((gid) * 8) + 9].var[(idx)] = val.s9; (arr)[((gid) * 8) + 10].var[(idx)] = val.sa; (arr)[((gid) * 8) + 11].var[(idx)] = val.sb; (arr)[((gid) * 8) + 12].var[(idx)] = val.sc; (arr)[((gid) * 8) + 13].var[(idx)] = val.sd; (arr)[((gid) * 8) + 14].var[(idx)] = val.se; (arr)[((gid) * 8) + 15].var[(idx)] = val.sf;
#define unpackv(arr,var,gid,idx,val) (arr)[((gid) * 16) + 0].var[(idx)] = val.s0; (arr)[((gid) * 16) + 1].var[(idx)] = val.s1; (arr)[((gid) * 16) + 2].var[(idx)] = val.s2; (arr)[((gid) * 16) + 3].var[(idx)] = val.s3; (arr)[((gid) * 16) + 4].var[(idx)] = val.s4; (arr)[((gid) * 16) + 5].var[(idx)] = val.s5; (arr)[((gid) * 16) + 6].var[(idx)] = val.s6; (arr)[((gid) * 16) + 7].var[(idx)] = val.s7; (arr)[((gid) * 16) + 8].var[(idx)] = val.s8; (arr)[((gid) * 16) + 9].var[(idx)] = val.s9; (arr)[((gid) * 16) + 10].var[(idx)] = val.sa; (arr)[((gid) * 16) + 11].var[(idx)] = val.sb; (arr)[((gid) * 16) + 12].var[(idx)] = val.sc; (arr)[((gid) * 16) + 13].var[(idx)] = val.sd; (arr)[((gid) * 16) + 14].var[(idx)] = val.se; (arr)[((gid) * 16) + 15].var[(idx)] = val.sf;
#endif

@ -220,12 +220,12 @@ static uint default_benchmark_algorithms[NUM_DEFAULT_BENCHMARK_ALGORITHMS] =
101,
111,
1711,
3000, // broken in osx
3000,
1000,
1100,
2100,
12800,
1500, // broken in osx
1500,
12400,
500,
3200,
@ -278,7 +278,7 @@ static uint default_benchmark_algorithms[NUM_DEFAULT_BENCHMARK_ALGORITHMS] =
10410,
10500,
10600,
10700, // broken in osx
10700,
9000,
5200,
6800,
@ -2877,7 +2877,7 @@ static void autotune (hc_device_param_t *device_param)
u32 steps_accel[STEPS_ACCEL_CNT];
u32 steps_loops[STEPS_LOOPS_CNT];
for (int i = 1; i < STEPS_ACCEL_CNT; i++)
for (int i = 0; i < STEPS_ACCEL_CNT; i++)
{
steps_accel[i] = 1 << i;
}
@ -2900,62 +2900,66 @@ static void autotune (hc_device_param_t *device_param)
u32 kernel_loops_tmp;
for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp >= kernel_loops_min; kernel_loops_tmp >>= 1)
for (kernel_loops_tmp = kernel_loops_max; kernel_loops_tmp > kernel_loops_min; kernel_loops_tmp >>= 1)
{
const double exec_ms = try_run (device_param, kernel_accel_min, kernel_loops_tmp, 1);
if (exec_ms < target_ms) break;
if (kernel_loops_tmp == kernel_loops_min) break;
}
// kernel-accel
double e_best = 0;
for (int i = 0; i < STEPS_ACCEL_CNT; i++)
if (kernel_accel_min < kernel_accel_max)
{
const u32 kernel_accel_try = steps_accel[i];
double e_best = 0;
for (int i = 0; i < STEPS_ACCEL_CNT; i++)
{
const u32 kernel_accel_try = steps_accel[i];
if (kernel_accel_try < kernel_accel_min) continue;
if (kernel_accel_try > kernel_accel_max) break;
if (kernel_accel_try < kernel_accel_min) continue;
if (kernel_accel_try > kernel_accel_max) break;
const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_tmp, 1);
if (exec_ms > target_ms) break;
if (exec_ms > target_ms) break;
const double e = kernel_accel_try / exec_ms;
const double e = kernel_accel_try / exec_ms;
if (e > e_best)
{
kernel_accel = kernel_accel_try;
if (e > e_best)
{
kernel_accel = kernel_accel_try;
e_best = e;
e_best = e;
}
}
}
// kernel-loops final
e_best = 0;
for (int i = 0; i < STEPS_LOOPS_CNT; i++)
if (kernel_loops_min < kernel_loops_max)
{
const u32 kernel_loops_try = steps_loops[i];
double e_best = 0;
if (kernel_loops_try < kernel_loops_min) continue;
if (kernel_loops_try > kernel_loops_max) break;
for (int i = 0; i < STEPS_LOOPS_CNT; i++)
{
const u32 kernel_loops_try = steps_loops[i];
const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
if (kernel_loops_try < kernel_loops_min) continue;
if (kernel_loops_try > kernel_loops_max) break;
if (exec_ms > target_ms) break;
const double exec_ms = try_run (device_param, kernel_accel, kernel_loops_try, 1);
const double e = kernel_loops_try / exec_ms;
if (exec_ms > target_ms) break;
if (e > e_best)
{
kernel_loops = kernel_loops_try;
const double e = kernel_loops_try / exec_ms;
if (e > e_best)
{
kernel_loops = kernel_loops_try;
e_best = e;
e_best = e;
}
}
}
@ -2970,54 +2974,64 @@ static void autotune (hc_device_param_t *device_param)
// reset
u32 kernel_accel_try = kernel_accel;
u32 kernel_loops_try = kernel_loops;
for (int i = 0; i < 2; i++)
if (kernel_accel_min < kernel_accel_max)
{
kernel_accel_try >>= 1;
kernel_loops_try <<= 1;
u32 kernel_accel_try = kernel_accel;
u32 kernel_loops_try = kernel_loops;
if (kernel_accel_try < kernel_accel_min) break;
if (kernel_loops_try > kernel_loops_max) break;
for (int i = 0; i < 2; i++)
{
kernel_accel_try >>= 1;
kernel_loops_try <<= 1;
const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
if (kernel_accel_try < kernel_accel_min) break;
if (kernel_loops_try > kernel_loops_max) break;
if (exec_ms < exec_best)
{
kernel_accel_best = kernel_accel_try;
kernel_loops_best = kernel_loops_try;
const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
exec_best = exec_ms;
if (exec_ms < exec_best)
{
kernel_accel_best = kernel_accel_try;
kernel_loops_best = kernel_loops_try;
exec_best = exec_ms;
}
}
kernel_accel = kernel_accel_best;
kernel_loops = kernel_loops_best;
}
// reset
kernel_accel_try = kernel_accel;
kernel_loops_try = kernel_loops;
for (int i = 0; i < 2; i++)
if (kernel_loops_min < kernel_loops_max)
{
kernel_accel_try <<= 1;
kernel_loops_try >>= 1;
u32 kernel_accel_try = kernel_accel;
u32 kernel_loops_try = kernel_loops;
for (int i = 0; i < 2; i++)
{
kernel_accel_try <<= 1;
kernel_loops_try >>= 1;
if (kernel_accel_try > kernel_accel_max) break;
if (kernel_loops_try < kernel_loops_min) break;
if (kernel_accel_try > kernel_accel_max) break;
if (kernel_loops_try < kernel_loops_min) break;
const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
const double exec_ms = try_run (device_param, kernel_accel_try, kernel_loops_try, 1);
if (exec_ms < exec_best)
{
kernel_accel_best = kernel_accel_try;
kernel_loops_best = kernel_loops_try;
if (exec_ms < exec_best)
{
kernel_accel_best = kernel_accel_try;
kernel_loops_best = kernel_loops_try;
exec_best = exec_ms;
exec_best = exec_ms;
}
}
}
kernel_accel = kernel_accel_best;
kernel_loops = kernel_loops_best;
kernel_accel = kernel_accel_best;
kernel_loops = kernel_loops_best;
}
// reset timer
@ -13431,21 +13445,6 @@ int main (int argc, char **argv)
#endif // HAVE_ADK
#endif // HAVE_HWMON
#ifdef OSX
if (hash_mode == 3000 || hash_mode == 1500 || hash_mode == 10700)
{
if (force == 0)
{
log_info ("");
log_info ("Warning: Hash mode %d is not stable with OSX.", hash_mode);
log_info ("You can use --force to override this but do not post error reports if you do so");
log_info ("");
continue;
}
}
#endif
#ifdef DEBUG
if (benchmark == 1) log_info ("Hashmode: %d", data.hash_mode);
#endif

@ -25,7 +25,7 @@ NEVER_CRACK="11600"
SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6221 6231 6241 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12100 12200 12300 12400 12500 12800 12900 13000"
OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0"
OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 -u 1024 -n 128"
OUTD="test_$(date +%s)"

Loading…
Cancel
Save