Prepare new SIMD code for kernels: -m 0, 10, 20, 1000 should work in -a 3 mode; the others should hopefully stay unaffected

pull/140/head
jsteube 8 years ago
parent 471c10c4f7
commit e3c0c80b6f

File diff suppressed because it is too large
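All four file diffs below apply the same scalar-to-vector conversion. A minimal sketch of the pattern, assuming u32x is the VECT_SIZE-wide vector of u32 defined in the new OpenCL/simd.c and that the host repacks words_buf_r as one u32x per VECT_SIZE candidates:

// before: one brute-force candidate per loop iteration
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
{
  const u32 w0r = words_buf_r[il_pos];
  const u32 w0  = w0l | w0r;
  // ... MD4/MD5 steps on scalar u32 a, b, c, d ...
}

// after: VECT_SIZE candidates per loop iteration
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
  const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; // one vector load
  const u32x w0  = w0l | w0r;                       // scalar w0l broadcasts across lanes
  // ... the same steps on u32x a, b, c, d ...
  COMPARE_M_SIMD (a, d, c, b); // replaces the r0..r3 assignments and #include COMPARE_M
}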

@@ -5,6 +5,8 @@
#define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h"
#include "include/kernel_vendor.h"
@@ -16,9 +18,7 @@
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#include "OpenCL/simd.c"
#define MD5_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
@@ -37,7 +37,7 @@
a -= t; \
}
static void m00000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
static void m00000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
@@ -124,18 +124,18 @@ static void m00000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
u32 w0l = w[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0 = w0l | w0r;
const u32x w0 = w0l | w0r;
u32 tmp2;
u32x tmp2;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
@@ -205,17 +205,11 @@ static void m00000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32);
MD5_STEP0(MD5_I , b, c, d, a, I_w9c3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
COMPARE_M_SIMD (a, d, c, b);
}
}
static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
@@ -312,10 +306,10 @@ static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
* reverse
*/
u32 a_rev = digests_buf[digests_offset].digest_buf[0];
u32 b_rev = digests_buf[digests_offset].digest_buf[1];
u32 c_rev = digests_buf[digests_offset].digest_buf[2];
u32 d_rev = digests_buf[digests_offset].digest_buf[3];
u32x a_rev = digests_buf[digests_offset].digest_buf[0];
u32x b_rev = digests_buf[digests_offset].digest_buf[1];
u32x c_rev = digests_buf[digests_offset].digest_buf[2];
u32x d_rev = digests_buf[digests_offset].digest_buf[3];
MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
@@ -334,7 +328,7 @@ static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
const u32 pre_cd = c_rev ^ d_rev;
const u32x pre_cd = c_rev ^ d_rev;
MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
@@ -345,23 +339,23 @@ static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
u32 w0l = w[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0 = w0l | w0r;
const u32x w0 = w0l | w0r;
const u32 pre_d = d_rev;
const u32 pre_a = a_rev - w0;
const u32 pre_b = b_rev - (pre_a ^ pre_cd);
const u32 pre_c = c_rev - (pre_a ^ pre_b ^ pre_d);
const u32x pre_d = d_rev;
const u32x pre_a = a_rev - w0;
const u32x pre_b = b_rev - (pre_a ^ pre_cd);
const u32x pre_c = c_rev - (pre_a ^ pre_b ^ pre_d);
u32 tmp2;
u32x tmp2;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
@@ -409,17 +403,13 @@ static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP (MD5_H2, d, a, b, c, w0, H_w0c29, MD5S21);
MD5_STEP0(MD5_H1, c, d, a, b, H_w3c2a, MD5S22);
bool q_cond = allx (pre_c != c);
if (q_cond) continue;
if (MATCHES_NONE_VV (pre_c, c)) continue;
MD5_STEP0(MD5_H2, b, c, d, a, H_w6c2b, MD5S23);
MD5_STEP0(MD5_H1, a, b, c, d, H_w9c2c, MD5S20);
MD5_STEP0(MD5_H2, d, a, b, c, H_wcc2d, MD5S21);
bool q_cond2 = allx (pre_d != d);
if (q_cond2) continue;
if (MATCHES_NONE_VV (pre_d, d)) continue;
MD5_STEP0(MD5_H1, c, d, a, b, H_wfc2e, MD5S22);
MD5_STEP0(MD5_H2, b, c, d, a, H_w2c2f, MD5S23);
@@ -441,17 +431,11 @@ static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32);
MD5_STEP0(MD5_I , b, c, d, a, I_w9c3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
COMPARE_S_SIMD (a, d, c, b);
}
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -489,7 +473,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m04 (__glo
m00000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -527,7 +511,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m08 (__glo
m00000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -565,7 +549,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_m16 (__glo
m00000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -603,7 +587,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s04 (__glo
m00000s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -641,7 +625,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s08 (__glo
m00000s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base

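The single-hash kernels replace the hand-rolled allx() early-outs with MATCHES_NONE_VV from simd.c. A hedged sketch of its semantics, inferred from the "bool q_cond = allx (pre_c != c); if (q_cond) continue;" form it replaces (the real definition lives in OpenCL/simd.c; the expansion below is an assumption, shown for VECT_SIZE == 2):

// true only when no vector lane of a equals the corresponding lane of b,
// so a whole vector of candidates is rejected before the remaining steps run
#define MATCHES_NONE_VV(a,b) (((a).s0 != (b).s0) && ((a).s1 != (b).s1))

// usage, as in the kernels above and below:
if (MATCHES_NONE_VV (pre_c, c)) continue;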
@@ -5,6 +5,8 @@
#define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h"
#include "include/kernel_vendor.h"
@@ -16,9 +18,7 @@
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#include "OpenCL/simd.c"
#define MD5_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
@@ -37,7 +37,7 @@
a -= t; \
}
static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
@@ -78,7 +78,7 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
salt_buf3[2] = 0;
salt_buf3[3] = 0;
switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
switch_buffer_by_offset_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
w[ 0] |= salt_buf0[0];
w[ 1] |= salt_buf0[1];
@@ -181,18 +181,18 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
u32 w0l = w[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0 = w0l | w0r;
const u32x w0 = w0l | w0r;
u32 tmp2;
u32x tmp2;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
@@ -262,17 +262,11 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32);
MD5_STEP0(MD5_I , b, c, d, a, I_w9c3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
COMPARE_M_SIMD (a, d, c, b);
}
}
static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
@@ -369,10 +363,10 @@ static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
* reverse
*/
u32 a_rev = digests_buf[digests_offset].digest_buf[0];
u32 b_rev = digests_buf[digests_offset].digest_buf[1];
u32 c_rev = digests_buf[digests_offset].digest_buf[2];
u32 d_rev = digests_buf[digests_offset].digest_buf[3];
u32x a_rev = digests_buf[digests_offset].digest_buf[0];
u32x b_rev = digests_buf[digests_offset].digest_buf[1];
u32x c_rev = digests_buf[digests_offset].digest_buf[2];
u32x d_rev = digests_buf[digests_offset].digest_buf[3];
MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33);
MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32);
@@ -391,7 +385,7 @@ static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31);
MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30);
const u32 pre_cd = c_rev ^ d_rev;
const u32x pre_cd = c_rev ^ d_rev;
MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23);
MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22);
@@ -402,23 +396,23 @@ static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
u32 w0l = w[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0 = w0l | w0r;
const u32x w0 = w0l | w0r;
const u32 pre_d = d_rev;
const u32 pre_a = a_rev - w0;
const u32 pre_b = b_rev - (pre_a ^ pre_cd);
const u32 pre_c = c_rev - (pre_a ^ pre_b ^ pre_d);
const u32x pre_d = d_rev;
const u32x pre_a = a_rev - w0;
const u32x pre_b = b_rev - (pre_a ^ pre_cd);
const u32x pre_c = c_rev - (pre_a ^ pre_b ^ pre_d);
u32 tmp2;
u32x tmp2;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00);
MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01);
@@ -466,17 +460,13 @@ static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP (MD5_H2, d, a, b, c, w0, H_w0c29, MD5S21);
MD5_STEP0(MD5_H1, c, d, a, b, H_w3c2a, MD5S22);
bool q_cond = allx (pre_c != c);
if (q_cond) continue;
if (MATCHES_NONE_VV (pre_c, c)) continue;
MD5_STEP0(MD5_H2, b, c, d, a, H_w6c2b, MD5S23);
MD5_STEP0(MD5_H1, a, b, c, d, H_w9c2c, MD5S20);
MD5_STEP0(MD5_H2, d, a, b, c, H_wcc2d, MD5S21);
bool q_cond2 = allx (pre_d != d);
if (q_cond2) continue;
if (MATCHES_NONE_VV (pre_d, d)) continue;
MD5_STEP0(MD5_H1, c, d, a, b, H_wfc2e, MD5S22);
MD5_STEP0(MD5_H2, b, c, d, a, H_w2c2f, MD5S23);
@@ -498,17 +488,11 @@ static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32);
MD5_STEP0(MD5_I , b, c, d, a, I_w9c3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
COMPARE_S_SIMD (a, d, c, b);
}
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -546,7 +530,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m04 (__glo
m00010m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -584,7 +568,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m08 (__glo
m00010m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -622,7 +606,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_m16 (__glo
m00010m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -660,7 +644,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s04 (__glo
m00010s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@@ -698,7 +682,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s08 (__glo
m00010s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m00010_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base

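The salted m00010 kernels above also switch salt handling to switch_buffer_by_offset_S. The _S suffix presumably marks the scalar variant under the new simd.c naming (an assumption); the salt is identical for every vector lane, so it can stay in plain u32 buffers:

// salt words remain scalar u32; only the per-candidate word w0 is a u32x
switch_buffer_by_offset_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);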
@@ -5,6 +5,8 @@
#define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h"
#include "include/kernel_vendor.h"
@@ -16,9 +18,7 @@
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#include "OpenCL/simd.c"
static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
@@ -71,22 +71,30 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
u32 w0l = w0[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = bfs_buf[il_pos].i;
w0[0] = w0l | w0r;
#if VECT_SIZE == 1
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i);
#elif VECT_SIZE == 2
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
#elif VECT_SIZE == 4
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
#elif VECT_SIZE == 8
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
#endif
const u32x w0lr = w0l | w0r;
/**
* prepend salt
*/
u32 w0_t[4];
u32 w1_t[4];
u32 w2_t[4];
u32 w3_t[4];
u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
w0_t[0] = w0[0];
w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
@@ -128,12 +136,12 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
* md5
*/
u32 tmp2;
u32x tmp2;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@@ -203,13 +211,7 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
COMPARE_M_SIMD (a, d, c, b);
}
}
@@ -276,22 +278,30 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
u32 w0l = w0[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = bfs_buf[il_pos].i;
w0[0] = w0l | w0r;
#if VECT_SIZE == 1
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i);
#elif VECT_SIZE == 2
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
#elif VECT_SIZE == 4
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
#elif VECT_SIZE == 8
const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i);
#endif
const u32x w0lr = w0l | w0r;
/**
* prepend salt
*/
u32 w0_t[4];
u32 w1_t[4];
u32 w2_t[4];
u32 w3_t[4];
u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
w0_t[0] = w0[0];
w0_t[0] = w0lr;
w0_t[1] = w0[1];
w0_t[2] = w0[2];
w0_t[3] = w0[3];
@@ -333,12 +343,12 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
* md5
*/
u32 tmp2;
u32x tmp2;
u32 a = MD5M_A;
u32 b = MD5M_B;
u32 c = MD5M_C;
u32 d = MD5M_D;
u32x a = MD5M_A;
u32x b = MD5M_B;
u32x c = MD5M_C;
u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@@ -405,21 +415,13 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30);
bool q_cond = allx (search[0] != a);
if (q_cond) continue;
if (MATCHES_NONE_VS (a, search[0])) continue;
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
COMPARE_S_SIMD (a, d, c, b);
}
}

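Unlike the other kernels, m00020 above keeps its __global bf_t *bfs_buf parameter instead of the repacked __constant u32x *words_buf_r, so it gathers the scalar .i words into a vector with OpenCL's vector-literal constructor. A condensed sketch contrasting the two load styles (names as in the diffs):

// pre-packed path (m00000, m00010, m01000): one aligned vector load
const u32x w0r_packed = words_buf_r[il_pos / VECT_SIZE];

// gather path (m00020): build the vector lane by lane from bf_t entries
#if   VECT_SIZE == 2
const u32x w0r_gather = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i);
#elif VECT_SIZE == 4
const u32x w0r_gather = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i,
                                bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i);
#endif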
@@ -5,6 +5,8 @@
#define _MD4_
#define NEW_SIMD_CODE
#include "include/constants.h"
#include "include/kernel_vendor.h"
@@ -16,9 +18,7 @@
#include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c"
#include "OpenCL/common.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#include "OpenCL/simd.c"
#define MD4_STEP_REV(f,a,b,c,d,x,t,s) \
{ \
@@ -35,7 +35,7 @@
a -= t; \
}
static void m01000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
static void m01000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
@@ -105,18 +105,18 @@ static void m01000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
u32 w0l = w[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0 = w0l | w0r;
const u32x w0 = w0l | w0r;
u32 tmp2;
u32x tmp2;
u32 a = MD4M_A;
u32 b = MD4M_B;
u32 c = MD4M_C;
u32 d = MD4M_D;
u32x a = MD4M_A;
u32x b = MD4M_B;
u32x c = MD4M_C;
u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00);
MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01);
@@ -169,16 +169,11 @@ static void m01000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD4_STEP0(MD4_H1, c, d, a, b, H_w7c02, MD4S22);
MD4_STEP0(MD4_H2, b, c, d, a, H_wfc02, MD4S23);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
COMPARE_M_SIMD (a, d, c, b);
}
}
static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{
/**
* modifier
@@ -258,10 +253,10 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
* reverse
*/
u32 a_rev = digests_buf[digests_offset].digest_buf[0];
u32 b_rev = digests_buf[digests_offset].digest_buf[1];
u32 c_rev = digests_buf[digests_offset].digest_buf[2];
u32 d_rev = digests_buf[digests_offset].digest_buf[3];
u32x a_rev = digests_buf[digests_offset].digest_buf[0];
u32x b_rev = digests_buf[digests_offset].digest_buf[1];
u32x c_rev = digests_buf[digests_offset].digest_buf[2];
u32x d_rev = digests_buf[digests_offset].digest_buf[3];
MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23);
MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22);
@@ -280,8 +275,8 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21);
MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20);
const u32 sav_c = c_rev;
const u32 sav_d = d_rev;
const u32x sav_c = c_rev;
const u32x sav_d = d_rev;
MD4_STEP_REV1(MD4_G, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13);
MD4_STEP_REV1(MD4_G, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12);
@@ -292,26 +287,26 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
u32 w0l = w[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++)
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0 = w0l | w0r;
const u32x w0 = w0l | w0r;
u32 pre_a = a_rev;
u32 pre_b = b_rev;
u32 pre_c = c_rev;
u32x pre_a = a_rev;
u32x pre_b = b_rev;
u32x pre_c = c_rev;
pre_a = pre_a - w0;
pre_b = pre_b - MD4_G (sav_c, sav_d, pre_a);
pre_c = pre_c - MD4_G (sav_d, pre_a, pre_b);
u32 tmp2;
u32x tmp2;
u32 a = MD4M_A;
u32 b = MD4M_B;
u32 c = MD4M_C;
u32 d = MD4M_D;
u32x a = MD4M_A;
u32x b = MD4M_B;
u32x c = MD4M_C;
u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00);
MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01);
@@ -342,16 +337,12 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD4_STEP0(MD4_Go, d, a, b, c, G_w6c01, MD4S11);
MD4_STEP0(MD4_Go, c, d, a, b, G_wac01, MD4S12);
bool q_cond = allx (pre_c != c);
if (q_cond) continue;
if (MATCHES_NONE_VV (pre_c, c)) continue;
MD4_STEP0(MD4_Go, b, c, d, a, G_wec01, MD4S13);
MD4_STEP0(MD4_Go, a, b, c, d, G_w3c01, MD4S10);
bool q_cond2 = allx (pre_a != a);
if (q_cond2) continue;
if (MATCHES_NONE_VV (pre_a, a)) continue;
MD4_STEP0(MD4_Go, d, a, b, c, G_w7c01, MD4S11);
MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12);
@@ -374,16 +365,11 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
MD4_STEP0(MD4_H1, c, d, a, b, H_w7c02, MD4S22);
MD4_STEP0(MD4_H2, b, c, d, a, H_wfc02, MD4S23);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
COMPARE_S_SIMD (a, d, c, b);
}
}
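Editor's note: the early-out above is a meet-in-the-middle trick. The target digest is run backwards through the final MD4 rounds once per kernel invocation (a_rev..d_rev), and each candidate then only needs to be hashed forward to the meeting point; when MATCHES_NONE_VV finds no lane whose mid-state can still reach the target, the remaining rounds are skipped. A minimal scalar sketch in plain C with made-up stand-in round functions (fwd_round/rev_round are hypothetical, not MD4):

#include <stdint.h>

static uint32_t fwd_round (uint32_t s, uint32_t w) { return (s * 5 + w) ^ 0xdead; }

/* exact inverse of fwd_round: 0xcccccccd is 5^-1 mod 2^32 */
static uint32_t rev_round (uint32_t s, uint32_t w) { return ((s ^ 0xdead) - w) * 0xcccccccd; }

static int candidate_survives (uint32_t digest, uint32_t w_tail, uint32_t mid_fwd)
{
  const uint32_t mid_rev = rev_round (digest, w_tail); /* computed once, reused for every candidate */

  return mid_fwd == mid_rev; /* on mismatch the caller skips the remaining rounds */
}

int main (void)
{
  const uint32_t mid = 0x12345678, w = 42;

  return candidate_survives (fwd_round (mid, w), w, mid) ? 0 : 1; /* exits 0 */
}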
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@ -421,7 +407,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m04 (__glo
m01000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@ -459,7 +445,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m08 (__glo
m01000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@ -497,7 +483,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_m16 (__glo
m01000m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@ -535,7 +521,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s04 (__glo
m01000s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
@ -573,7 +559,7 @@ __kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s08 (__glo
m01000s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset);
}
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
__kernel void __attribute__((reqd_work_group_size (64, 1, 1))) m01000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base

@ -0,0 +1,590 @@
// vliw1
#if VECT_SIZE == 1
#define MATCHES_ONE_VV(a,b) ((a) == (b))
#define MATCHES_ONE_VS(a,b) ((a) == (b))
#define COMPARE_S_SIMD(h0,h1,h2,h3) \
{ \
if (((h0) == search[0]) && ((h1) == search[1]) && ((h2) == search[2]) && ((h3) == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos); \
\
d_return_buf[lid] = 1; \
} \
} \
}
#define COMPARE_M_SIMD(h0,h1,h2,h3) \
{ \
const u32 digest_tp0[4] = { h0, h1, h2, h3 }; \
\
if (check (digest_tp0, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp0, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
}
#endif
// vliw2
#if VECT_SIZE == 2
#define MATCHES_ONE_VV(a,b) (((a).s0 == (b).s0) || ((a).s1 == (b).s1))
#define MATCHES_ONE_VS(a,b) (((a).s0 == (b) ) || ((a).s1 == (b) ))
#define COMPARE_S_SIMD(h0,h1,h2,h3) \
{ \
if (((h0).s0 == search[0]) && ((h1).s0 == search[1]) && ((h2).s0 == search[2]) && ((h3).s0 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s1 == search[0]) && ((h1).s1 == search[1]) && ((h2).s1 == search[2]) && ((h3).s1 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \
\
d_return_buf[lid] = 1; \
} \
} \
}
#define COMPARE_M_SIMD(h0,h1,h2,h3) \
{ \
const u32 digest_tp0[4] = { h0.s0, h1.s0, h2.s0, h3.s0 }; \
const u32 digest_tp1[4] = { h0.s1, h1.s1, h2.s1, h3.s1 }; \
\
if (check (digest_tp0, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp0, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp1, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp1, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
}
#endif
// vliw4
#if VECT_SIZE == 4
#define MATCHES_ONE_VV(a,b) (((a).s0 == (b).s0) || ((a).s1 == (b).s1) || ((a).s2 == (b).s2) || ((a).s3 == (b).s3))
#define MATCHES_ONE_VS(a,b) (((a).s0 == (b) ) || ((a).s1 == (b) ) || ((a).s2 == (b) ) || ((a).s3 == (b) ))
#define COMPARE_S_SIMD(h0,h1,h2,h3) \
{ \
if (((h0).s0 == search[0]) && ((h1).s0 == search[1]) && ((h2).s0 == search[2]) && ((h3).s0 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s1 == search[0]) && ((h1).s1 == search[1]) && ((h2).s1 == search[2]) && ((h3).s1 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s2 == search[0]) && ((h1).s2 == search[1]) && ((h2).s2 == search[2]) && ((h3).s2 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 2); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s3 == search[0]) && ((h1).s3 == search[1]) && ((h2).s3 == search[2]) && ((h3).s3 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 3); \
\
d_return_buf[lid] = 1; \
} \
} \
}
#define COMPARE_M_SIMD(h0,h1,h2,h3) \
{ \
const u32 digest_tp0[4] = { h0.s0, h1.s0, h2.s0, h3.s0 }; \
const u32 digest_tp1[4] = { h0.s1, h1.s1, h2.s1, h3.s1 }; \
const u32 digest_tp2[4] = { h0.s2, h1.s2, h2.s2, h3.s2 }; \
const u32 digest_tp3[4] = { h0.s3, h1.s3, h2.s3, h3.s3 }; \
\
if (check (digest_tp0, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp0, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp1, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp1, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp2, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp2, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 2); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp3, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp3, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 3); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
}
#endif
// vliw8
#if VECT_SIZE == 8
#define MATCHES_ONE_VV(a,b) (((a).s0 == (b).s0) || ((a).s1 == (b).s1) || ((a).s2 == (b).s2) || ((a).s3 == (b).s3) || ((a).s4 == (b).s4) || ((a).s5 == (b).s5) || ((a).s6 == (b).s6) || ((a).s7 == (b).s7))
#define MATCHES_ONE_VS(a,b) (((a).s0 == (b) ) || ((a).s1 == (b) ) || ((a).s2 == (b) ) || ((a).s3 == (b) ) || ((a).s4 == (b) ) || ((a).s5 == (b) ) || ((a).s6 == (b) ) || ((a).s7 == (b) ))
#define COMPARE_S_SIMD(h0,h1,h2,h3) \
{ \
if (((h0).s0 == search[0]) && ((h1).s0 == search[1]) && ((h2).s0 == search[2]) && ((h3).s0 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s1 == search[0]) && ((h1).s1 == search[1]) && ((h2).s1 == search[2]) && ((h3).s1 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s2 == search[0]) && ((h1).s2 == search[1]) && ((h2).s2 == search[2]) && ((h3).s2 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 2); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s3 == search[0]) && ((h1).s3 == search[1]) && ((h2).s3 == search[2]) && ((h3).s3 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 3); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s4 == search[0]) && ((h1).s4 == search[1]) && ((h2).s4 == search[2]) && ((h3).s4 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 4); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s5 == search[0]) && ((h1).s5 == search[1]) && ((h2).s5 == search[2]) && ((h3).s5 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 5); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s6 == search[0]) && ((h1).s6 == search[1]) && ((h2).s6 == search[2]) && ((h3).s6 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 6); \
\
d_return_buf[lid] = 1; \
} \
} \
\
if (((h0).s7 == search[0]) && ((h1).s7 == search[1]) && ((h2).s7 == search[2]) && ((h3).s7 == search[3])) \
{ \
const u32 final_hash_pos = digests_offset + 0; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 7); \
\
d_return_buf[lid] = 1; \
} \
} \
}
#define COMPARE_M_SIMD(h0,h1,h2,h3) \
{ \
const u32 digest_tp0[4] = { h0.s0, h1.s0, h2.s0, h3.s0 }; \
const u32 digest_tp1[4] = { h0.s1, h1.s1, h2.s1, h3.s1 }; \
const u32 digest_tp2[4] = { h0.s2, h1.s2, h2.s2, h3.s2 }; \
const u32 digest_tp3[4] = { h0.s3, h1.s3, h2.s3, h3.s3 }; \
const u32 digest_tp4[4] = { h0.s4, h1.s4, h2.s4, h3.s4 }; \
const u32 digest_tp5[4] = { h0.s5, h1.s5, h2.s5, h3.s5 }; \
const u32 digest_tp6[4] = { h0.s6, h1.s6, h2.s6, h3.s6 }; \
const u32 digest_tp7[4] = { h0.s7, h1.s7, h2.s7, h3.s7 }; \
\
if (check (digest_tp0, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp0, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 0); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp1, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp1, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 1); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp2, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp2, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 2); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp3, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp3, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 3); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp4, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp4, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 4); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp5, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp5, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 5); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp6, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp6, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 6); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
\
if (check (digest_tp7, \
bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, \
bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, \
bitmap_mask, \
bitmap_shift1, \
bitmap_shift2)) \
{ \
int hash_pos = find_hash (digest_tp7, digests_cnt, &digests_buf[digests_offset]); \
\
if (hash_pos != -1) \
{ \
const u32 final_hash_pos = digests_offset + hash_pos; \
\
if (atomic_add (&hashes_shown[final_hash_pos], 1) == 0) \
{ \
mark_hash (plains_buf, hashes_shown, final_hash_pos, gid, il_pos + 7); \
\
d_return_buf[lid] = 1; \
} \
} \
} \
}
#endif
#define MATCHES_NONE_VV(a,b) !(MATCHES_ONE_VV ((a), (b)))
#define MATCHES_NONE_VS(a,b) !(MATCHES_ONE_VS ((a), (b)))
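Editor's note: MATCHES_ONE_VV collapses a per-lane comparison into a single scalar branch condition, and MATCHES_NONE_VV is its negation, which the kernels above use for the early continue. A host-side model in plain C for VECT_SIZE == 4 (the u32x4 struct is a stand-in for OpenCL's uint4):

#include <stdio.h>
#include <stdint.h>

typedef struct { uint32_t s0, s1, s2, s3; } u32x4; /* stand-in for uint4 */

static int matches_one_vv (u32x4 a, u32x4 b)
{
  return (a.s0 == b.s0) || (a.s1 == b.s1) || (a.s2 == b.s2) || (a.s3 == b.s3);
}

int main (void)
{
  const u32x4 pre_c = { 0x11, 0x22, 0x33, 0x44 };
  const u32x4 c     = { 0xaa, 0x22, 0xbb, 0xcc }; /* lane .s1 matches */

  if (!matches_one_vv (pre_c, c)) puts ("MATCHES_NONE_VV: skip remaining rounds");
  else                            puts ("at least one lane may still match");

  return 0;
}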

@ -8,14 +8,33 @@ typedef ushort u16;
typedef uint u32;
typedef ulong u64;
#define allx(r) r
#ifndef NEW_SIMD_CODE
#undef VECT_SIZE
#define VECT_SIZE 1
#endif
/*
static u32 allx (const u32 r)
{
return r;
}
*/
#if VECT_SIZE == 1
typedef uint u32x;
typedef ulong u64x;
#endif
#if VECT_SIZE == 2
typedef uint2 u32x;
typedef ulong2 u64x;
#endif
#if VECT_SIZE == 4
typedef uint4 u32x;
typedef ulong4 u64x;
#endif
#if VECT_SIZE == 8
typedef uint8 u32x;
typedef ulong8 u64x;
#endif
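Editor's note: kernels that do not define NEW_SIMD_CODE get VECT_SIZE forced back to 1 above, so they compile against the scalar typedef unchanged. For ported kernels, a scalar constant broadcasts across every lane of a u32x and all arithmetic is componentwise, which is why the MD4/MD5 step macros work on u32x as-is. A hypothetical standalone OpenCL C illustration (not part of this diff):

#define VECT_SIZE 4

#if VECT_SIZE == 4
typedef uint4 u32x;
#endif

__kernel void u32x_demo (__global uint *out)
{
  const u32x base = (u32x) (0, 1, 2, 3); /* four independent candidates per work-item */

  const u32x r = rotate (base + 0x01234567, (u32x) 5); /* scalar add broadcasts; rotate is componentwise */

  out[get_global_id (0)] = r.s0 ^ r.s1 ^ r.s2 ^ r.s3;
}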
// this one needs to die
#define allx(r) r
static inline u32 l32_from_64 (u64 a)
{
@ -101,7 +120,7 @@ static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
#endif
#ifdef IS_NV
static inline u32 __byte_perm (const u32 a, const u32 b, const u32 c)
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -110,6 +129,40 @@ static inline u32 __byte_perm (const u32 a, const u32 b, const u32 c)
return r;
}
static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
u32x r;
#if VECT_SIZE == 1
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) );
#endif
#if VECT_SIZE == 2
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
#endif
#if VECT_SIZE == 4
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
#endif
#if VECT_SIZE == 8
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6));
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s7) : "r"(a.s7), "r"(b.s7), "r"(c.s7));
#endif
return r;
}
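Editor's note: in the default prmt.b32 mode assumed here, each of the low four selector nibbles picks one byte out of the eight-byte pool {b:a}, where byte 0 is the lowest byte of a. A plain-C reference model, ignoring the sign-replication selector modes:

#include <assert.h>
#include <stdint.h>

static uint32_t byte_perm_ref (uint32_t a, uint32_t b, uint32_t s)
{
  const uint64_t pool = ((uint64_t) b << 32) | a; /* bytes 0..3 = a, bytes 4..7 = b */

  uint32_t r = 0;

  for (int i = 0; i < 4; i++)
  {
    const uint32_t sel = (s >> (i * 4)) & 7; /* byte index into the pool */

    r |= (uint32_t) ((pool >> (sel * 8)) & 0xff) << (i * 8);
  }

  return r;
}

int main (void)
{
  /* selector 0x3210 is the identity; 0x4321 shifts the pool right one byte,
     matching the bytealign selector (0x76543210 >> ((c & 3) * 4)) & 0xffff */
  assert (byte_perm_ref (0x33221100, 0x77665544, 0x3210) == 0x33221100);
  assert (byte_perm_ref (0x33221100, 0x77665544, 0x4321) == 0x44332211);

  return 0;
}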
static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -118,6 +171,7 @@ static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
return r;
}
#if CUDA_ARCH >= 350
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
@ -130,7 +184,7 @@ static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
#else
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
{
return __byte_perm (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
}
#endif
#endif
@ -145,19 +199,48 @@ static inline u32 __bfe (const u32 a, const u32 b, const u32 c)
return BFE (a, b, c);
}
static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
{
return (u32) (((((u64) a) << 32) | (u64) b) >> ((c & 3) * 8));
const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);
return (u32) (tmp);
}
static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
{
#if VECT_SIZE == 1
const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);
return (u32x) (tmp);
#endif
#if VECT_SIZE == 2
const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8);
return (u32x) (tmp.s0, tmp.s1);
#endif
#if VECT_SIZE == 4
const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8);
return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3);
#endif
#if VECT_SIZE == 8
const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8);
return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7);
#endif
}
#endif
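Editor's note: the scalar and vector variants implement the same contract, amd_bytealign (a, b, c) returns the 32-bit window starting (c & 3) bytes into the 64-bit value a:b. A plain-C spot check:

#include <assert.h>
#include <stdint.h>

static uint32_t bytealign_ref (uint32_t a, uint32_t b, uint32_t c)
{
  return (uint32_t) ((((uint64_t) a << 32) | b) >> ((c & 3) * 8));
}

int main (void)
{
  assert (bytealign_ref (0xddccbbaa, 0x44332211, 0) == 0x44332211); /* c == 0: b unchanged */
  assert (bytealign_ref (0xddccbbaa, 0x44332211, 1) == 0xaa443322); /* low byte of a shifts in */
  assert (bytealign_ref (0xddccbbaa, 0x44332211, 3) == 0xccbbaa44);

  return 0;
}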
#ifdef IS_AMD
static inline u32 rotr32 (const u32 a, const u32 n)
static inline u32x rotr32 (const u32x a, const u32 n)
{
return rotate (a, 32 - n);
}
static inline u32 rotl32 (const u32 a, const u32 n)
static inline u32x rotl32 (const u32x a, const u32 n)
{
return rotate (a, n);
}
@ -183,65 +266,14 @@ static inline u64 rotl64 (const u64 a, const u32 n)
#endif
#ifdef IS_NV
#if CUDA_ARCH >= 350
/*
this version reduced the number of registers, but for some unknown reason the whole kernel became slower... instruction cache monster?
static inline u32 rotr32 (const u32 a, const u32 n)
{
u32 r;
switch (n & 31)
{
case 0: asm ("shf.r.wrap.b32 %0, %1, %1, 0;" : "=r"(r) : "r"(a)); break;
case 1: asm ("shf.r.wrap.b32 %0, %1, %1, 1;" : "=r"(r) : "r"(a)); break;
case 2: asm ("shf.r.wrap.b32 %0, %1, %1, 2;" : "=r"(r) : "r"(a)); break;
case 3: asm ("shf.r.wrap.b32 %0, %1, %1, 3;" : "=r"(r) : "r"(a)); break;
case 4: asm ("shf.r.wrap.b32 %0, %1, %1, 4;" : "=r"(r) : "r"(a)); break;
case 5: asm ("shf.r.wrap.b32 %0, %1, %1, 5;" : "=r"(r) : "r"(a)); break;
case 6: asm ("shf.r.wrap.b32 %0, %1, %1, 6;" : "=r"(r) : "r"(a)); break;
case 7: asm ("shf.r.wrap.b32 %0, %1, %1, 7;" : "=r"(r) : "r"(a)); break;
case 8: asm ("shf.r.wrap.b32 %0, %1, %1, 8;" : "=r"(r) : "r"(a)); break;
case 9: asm ("shf.r.wrap.b32 %0, %1, %1, 9;" : "=r"(r) : "r"(a)); break;
case 10: asm ("shf.r.wrap.b32 %0, %1, %1, 10;" : "=r"(r) : "r"(a)); break;
case 11: asm ("shf.r.wrap.b32 %0, %1, %1, 11;" : "=r"(r) : "r"(a)); break;
case 12: asm ("shf.r.wrap.b32 %0, %1, %1, 12;" : "=r"(r) : "r"(a)); break;
case 13: asm ("shf.r.wrap.b32 %0, %1, %1, 13;" : "=r"(r) : "r"(a)); break;
case 14: asm ("shf.r.wrap.b32 %0, %1, %1, 14;" : "=r"(r) : "r"(a)); break;
case 15: asm ("shf.r.wrap.b32 %0, %1, %1, 15;" : "=r"(r) : "r"(a)); break;
case 16: asm ("shf.r.wrap.b32 %0, %1, %1, 16;" : "=r"(r) : "r"(a)); break;
case 17: asm ("shf.r.wrap.b32 %0, %1, %1, 17;" : "=r"(r) : "r"(a)); break;
case 18: asm ("shf.r.wrap.b32 %0, %1, %1, 18;" : "=r"(r) : "r"(a)); break;
case 19: asm ("shf.r.wrap.b32 %0, %1, %1, 19;" : "=r"(r) : "r"(a)); break;
case 20: asm ("shf.r.wrap.b32 %0, %1, %1, 20;" : "=r"(r) : "r"(a)); break;
case 21: asm ("shf.r.wrap.b32 %0, %1, %1, 21;" : "=r"(r) : "r"(a)); break;
case 22: asm ("shf.r.wrap.b32 %0, %1, %1, 22;" : "=r"(r) : "r"(a)); break;
case 23: asm ("shf.r.wrap.b32 %0, %1, %1, 23;" : "=r"(r) : "r"(a)); break;
case 24: asm ("shf.r.wrap.b32 %0, %1, %1, 24;" : "=r"(r) : "r"(a)); break;
case 25: asm ("shf.r.wrap.b32 %0, %1, %1, 25;" : "=r"(r) : "r"(a)); break;
case 26: asm ("shf.r.wrap.b32 %0, %1, %1, 26;" : "=r"(r) : "r"(a)); break;
case 27: asm ("shf.r.wrap.b32 %0, %1, %1, 27;" : "=r"(r) : "r"(a)); break;
case 28: asm ("shf.r.wrap.b32 %0, %1, %1, 28;" : "=r"(r) : "r"(a)); break;
case 29: asm ("shf.r.wrap.b32 %0, %1, %1, 29;" : "=r"(r) : "r"(a)); break;
case 30: asm ("shf.r.wrap.b32 %0, %1, %1, 30;" : "=r"(r) : "r"(a)); break;
case 31: asm ("shf.r.wrap.b32 %0, %1, %1, 31;" : "=r"(r) : "r"(a)); break;
}
return r;
}
*/
static inline u32 rotr32 (const u32 a, const u32 n)
static inline u32x rotr32 (const u32x a, const u32 n)
{
u32 r;
asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(a), "r"(n));
return r;
return rotate (a, 32 - n);
}
static inline u32 rotl32 (const u32 a, const u32 n)
static inline u32x rotl32 (const u32x a, const u32 n)
{
return rotr32 (a, 32 - n);
return rotate (a, n);
}
static inline u64 rotr64 (const u64 a, const u32 n)
@ -276,36 +308,16 @@ static inline u64 rotl64 (const u64 a, const u32 n)
{
return rotr64 (a, 64 - n);
}
#else
static inline u32 rotr32 (const u32 a, const u32 n)
{
return rotate (a, 32 - n);
}
static inline u32 rotl32 (const u32 a, const u32 n)
{
return rotate (a, n);
}
static inline u64 rotr64 (const u64 a, const u32 n)
{
return rotate (a, (u64) 64 - n);
}
static inline u64 rotl64 (const u64 a, const u32 n)
{
return rotate (a, (u64) n);
}
#endif
#endif
#ifdef IS_GENERIC
static inline u32 rotr32 (const u32 a, const u32 n)
static inline u32x rotr32 (const u32x a, const u32x n)
{
return rotate (a, 32 - n);
}
static inline u32 rotl32 (const u32 a, const u32 n)
static inline u32x rotl32 (const u32x a, const u32x n)
{
return rotate (a, n);
}
@ -323,68 +335,244 @@ static inline u64 rotl64 (const u64 a, const u32 n)
#ifdef IS_NV
#if CUDA_ARCH >= 500
static inline u32 lut3_2d (const u32 a, const u32 b, const u32 c)
static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
{
u32 r;
u32x r;
#if VECT_SIZE == 1
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
#if VECT_SIZE == 2
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
#if VECT_SIZE == 4
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
#if VECT_SIZE == 8
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif
return r;
}
static inline u32 lut3_39 (const u32 a, const u32 b, const u32 c)
static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
{
u32 r;
u32x r;
#if VECT_SIZE == 1
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
#if VECT_SIZE == 2
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
#if VECT_SIZE == 4
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
#if VECT_SIZE == 8
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif
return r;
}
static inline u32 lut3_59 (const u32 a, const u32 b, const u32 c)
static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
{
u32 r;
u32x r;
#if VECT_SIZE == 1
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
#if VECT_SIZE == 2
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
#if VECT_SIZE == 4
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
#if VECT_SIZE == 8
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif
return r;
}
static inline u32 lut3_96 (const u32 a, const u32 b, const u32 c)
static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
{
u32 r;
u32x r;
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#if VECT_SIZE == 1
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
#if VECT_SIZE == 2
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
#if VECT_SIZE == 4
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
#if VECT_SIZE == 8
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif
return r;
}
static inline u32 lut3_e4 (const u32 a, const u32 b, const u32 c)
static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
{
u32 r;
u32x r;
#if VECT_SIZE == 1
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
#if VECT_SIZE == 2
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
#if VECT_SIZE == 4
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#if VECT_SIZE == 8
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif
return r;
}
static inline u32 lut3_e8 (const u32 a, const u32 b, const u32 c)
static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
{
u32 r;
u32x r;
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#if VECT_SIZE == 1
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
#if VECT_SIZE == 2
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
#if VECT_SIZE == 4
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
#if VECT_SIZE == 8
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif
return r;
}
static inline u32 lut3_ca (const u32 a, const u32 b, const u32 c)
static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
{
u32 r;
u32x r;
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#if VECT_SIZE == 1
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
#endif
#if VECT_SIZE == 2
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
#endif
#if VECT_SIZE == 4
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
#endif
#if VECT_SIZE == 8
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
#endif
return r;
}
#endif
#endif
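Editor's note: the lop3.b32 immediates are three-input truth tables. With the identity bit patterns A=0xF0, B=0xCC, C=0xAA, the immediate is simply the desired expression evaluated on those patterns: 0x96 is a^b^c, 0xE8 is MAJ(a,b,c), 0xCA is the bitselect (a&b)|(~a&c), and so on. A plain-C reference evaluator:

#include <assert.h>
#include <stdint.h>

static uint32_t lop3_ref (uint32_t a, uint32_t b, uint32_t c, uint8_t lut)
{
  uint32_t r = 0;

  for (int i = 0; i < 32; i++)
  {
    /* bit index into the truth table: (a << 2) | (b << 1) | c */
    const unsigned idx = (((a >> i) & 1) << 2)
                       | (((b >> i) & 1) << 1)
                       |  ((c >> i) & 1);

    r |= (uint32_t) ((lut >> idx) & 1) << i;
  }

  return r;
}

int main (void)
{
  const uint32_t a = 0x12345678, b = 0x9abcdef0, c = 0x0f0f0f0f;

  assert (lop3_ref (a, b, c, 0x96) == (a ^ b ^ c));
  assert (lop3_ref (a, b, c, 0xe8) == ((a & b) | (a & c) | (b & c)));
  assert (lop3_ref (a, b, c, 0xca) == ((a & b) | (~a & c)));

  return 0;
}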

@ -58,6 +58,11 @@ File.: Host
Desc.: Added option --opencl-device-types to select specific OpenCL device types
Issue: 2
Type.: Feature
File.: Host
Desc.: Added option --opencl-vector-width to override the automatically selected vector width
Issue: 2
Type.: Feature
File.: Host
Desc.: Implemented a new feature that allows quitting at the next restore-point update (and disabling it)
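Editor's note: a hypothetical invocation exercising the new option (hash file and mask made up for illustration) would be: oclHashcat -m 0 -a 3 --opencl-vector-width 4 hashes.txt ?a?a?a?a?a?a, forcing four candidates per work-item regardless of what the device reports.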

@ -175,6 +175,7 @@ _oclHashcat ()
local ATTACK_MODES="0 1 3 6 7"
local OUTFILE_FORMATS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15"
local OPENCL_DEVICE_TYPES="1 2 3"
local OPENCL_VECTOR_WIDTH="1 2 4 8"
local BENCHMARK_MODE="0 1"
local DEBUG_MODE="1 2 3 4"
local WORKLOAD_PROFILE="1 2 3"
@ -185,8 +186,8 @@ _oclHashcat ()
local BUILD_IN_CHARSETS='?l ?u ?d ?a ?b ?s'
local SHORT_OPTS="-m -a -V -v -h -b -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -i -s -l"
local LONG_OPTS="--hash-type --attack-mode --version --help --eula --quiet --benchmark --benchmark-mode --hex-salt --hex-wordlist --hex-charset --force --status --status-timer --status-automat --loopback --weak-hash-threshold --markov-hcstat --markov-disable --markov-classic --markov-threshold --runtime --session --restore --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-check-timer --outfile-check-dir --separator --show --left --username --remove --remove-timer --potfile-disable --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --opencl-devices --opencl-platforms --opencl-device-types --workload-profile --kernel-accel --kernel-loops --gpu-temp-disable --gpu-temp-abort --gpu-temp-retain --powertune-enable --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --rules-cleanup --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --truecrypt-keyfiles"
local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --benchmark-mode --status-timer --weak-hash-threshold --markov-hcstat --markov-threshold --runtime --session --timer --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --opencl-devices --opencl-platforms --opencl-device-types --workload-profile --kernel-accel --kernel-loops --gpu-temp-abort --gpu-temp-retain -disable --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment-min --increment-max --scrypt-tmto --truecrypt-keyfiles"
local LONG_OPTS="--hash-type --attack-mode --version --help --eula --quiet --benchmark --benchmark-mode --hex-salt --hex-wordlist --hex-charset --force --status --status-timer --status-automat --loopback --weak-hash-threshold --markov-hcstat --markov-disable --markov-classic --markov-threshold --runtime --session --restore --restore-disable --outfile --outfile-format --outfile-autohex-disable --outfile-check-timer --outfile-check-dir --separator --show --left --username --remove --remove-timer --potfile-disable --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --opencl-devices --opencl-platforms --opencl-device-types --opencl-vector-width --workload-profile --kernel-accel --kernel-loops --gpu-temp-disable --gpu-temp-abort --gpu-temp-retain --powertune-enable --skip --limit --keyspace --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --rules-cleanup --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment --increment-min --increment-max --logfile-disable --scrypt-tmto --truecrypt-keyfiles"
local OPTIONS="-m -a -t -o -p -c -d -w -n -u -j -k -r -g -1 -2 -3 -4 -s -l --hash-type --attack-mode --benchmark-mode --status-timer --weak-hash-threshold --markov-hcstat --markov-threshold --runtime --session --timer --outfile --outfile-format --outfile-check-timer --outfile-check-dir --separator --remove-timer --debug-mode --debug-file --induction-dir --segment-size --bitmap-min --bitmap-max --cpu-affinity --opencl-devices --opencl-platforms --opencl-device-types --opencl-vector-width --workload-profile --kernel-accel --kernel-loops --gpu-temp-abort --gpu-temp-retain -disable --skip --limit --rule-left --rule-right --rules-file --generate-rules --generate-rules-func-min --generate-rules-func-max --generate-rules-seed --custom-charset1 --custom-charset2 --custom-charset3 --custom-charset4 --increment-min --increment-max --scrypt-tmto --truecrypt-keyfiles"
COMPREPLY=()
local cur="${COMP_WORDS[COMP_CWORD]}"
@ -271,6 +272,11 @@ _oclHashcat ()
return 0
;;
--opencl-vector-width)
COMPREPLY=($(compgen -W "${OPENCL_VECTOR_WIDTH}" -- ${cur}))
return 0
;;
--opencl-platforms)
local icd_list=$(ls -1 /etc/OpenCL/vendors/*.icd 2> /dev/null)

@ -836,6 +836,8 @@ struct __hc_device_param
u64 device_global_mem;
u32 device_maxclock_frequency;
uint vector_width;
uint kernel_threads;
uint kernel_accel;
uint kernel_power; // these both are based on their _user counterpart

@ -80,6 +80,7 @@ const uint RESTORE_MIN = 210;
#define POWERTUNE_ENABLE 0
#define LOGFILE_DISABLE 0
#define SCRYPT_TMTO 0
#define OPENCL_VECTOR_WIDTH 0
#define WL_MODE_STDIN 1
#define WL_MODE_FILE 2
@ -388,6 +389,7 @@ const char *USAGE_BIG[] =
" --opencl-platforms=STR OpenCL platforms to use, separate with comma",
" -d, --opencl-devices=STR OpenCL devices to use, separate with comma",
" --opencl-device-types=STR OpenCL device-types to use, separate with comma, see references below",
" --opencl-vector-width=NUM OpenCL vector-width (either 1, 2, 4 or 8), overrides value from device query",
" -w, --workload-profile=NUM Enable a specific workload profile, see references below",
" -n, --kernel-accel=NUM Workload tuning: 1, 8, 40, 80, 160",
" -u, --kernel-loops=NUM Workload fine-tuning: 8 - 1024",
@ -5110,6 +5112,7 @@ int main (int argc, char **argv)
char *opencl_devices = NULL;
char *opencl_platforms = NULL;
char *opencl_device_types = NULL;
uint opencl_vector_width = OPENCL_VECTOR_WIDTH;
char *truecrypt_keyfiles = NULL;
uint workload_profile = WORKLOAD_PROFILE;
uint kernel_accel = KERNEL_ACCEL;
@ -5185,6 +5188,7 @@ int main (int argc, char **argv)
#define IDX_OPENCL_DEVICES 'd'
#define IDX_OPENCL_PLATFORMS 0xff72
#define IDX_OPENCL_DEVICE_TYPES 0xff73
#define IDX_OPENCL_VECTOR_WIDTH 0xff74
#define IDX_WORKLOAD_PROFILE 'w'
#define IDX_KERNEL_ACCEL 'n'
#define IDX_KERNEL_LOOPS 'u'
@ -5266,6 +5270,7 @@ int main (int argc, char **argv)
{"opencl-devices", required_argument, 0, IDX_OPENCL_DEVICES},
{"opencl-platforms", required_argument, 0, IDX_OPENCL_PLATFORMS},
{"opencl-device-types", required_argument, 0, IDX_OPENCL_DEVICE_TYPES},
{"opencl-vector-width", required_argument, 0, IDX_OPENCL_VECTOR_WIDTH},
{"workload-profile", required_argument, 0, IDX_WORKLOAD_PROFILE},
{"kernel-accel", required_argument, 0, IDX_KERNEL_ACCEL},
{"kernel-loops", required_argument, 0, IDX_KERNEL_LOOPS},
@ -5570,6 +5575,8 @@ int main (int argc, char **argv)
case IDX_OPENCL_PLATFORMS: opencl_platforms = optarg; break;
case IDX_OPENCL_DEVICE_TYPES: opencl_device_types = optarg; break;
case IDX_OPENCL_VECTOR_WIDTH: opencl_vector_width = atoi (optarg); break;
case IDX_WORKLOAD_PROFILE: workload_profile = atoi (optarg); break;
case IDX_KERNEL_ACCEL: kernel_accel = atoi (optarg);
kernel_accel_chgd = 1; break;
@ -5862,6 +5869,13 @@ int main (int argc, char **argv)
return (-1);
}
if ((opencl_vector_width != 0) && (opencl_vector_width != 1) && (opencl_vector_width != 2) && (opencl_vector_width != 4) && (opencl_vector_width != 8))
{
log_error ("ERROR: opencl-vector-width %i not allowed", opencl_vector_width);
return (-1);
}
if (show == 1 || left == 1)
{
attack_mode = ATTACK_MODE_NONE;
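
Side note on the whitelist above: 0 is the OPENCL_VECTOR_WIDTH default and means "keep the width reported by the device query"; only 1, 2, 4 and 8 force a width. A minimal standalone version of the check, with a hypothetical helper name that is not part of this commit, could look like this:

static int opencl_vector_width_valid (const uint w)
{
  // 0 = keep the device's native width, otherwise only 1/2/4/8 are allowed
  return (w == 0) || (w == 1) || (w == 2) || (w == 4) || (w == 8);
}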
@ -6416,6 +6430,7 @@ int main (int argc, char **argv)
logfile_top_string (opencl_devices);
logfile_top_string (opencl_platforms);
logfile_top_string (opencl_device_types);
logfile_top_uint (opencl_vector_width);
logfile_top_string (induction_dir);
logfile_top_string (markov_hcstat);
logfile_top_string (outfile);
@ -10257,6 +10272,8 @@ int main (int argc, char **argv)
continue;
}
if (plain_len >= 255) continue;
memcpy (pot_ptr->plain_buf, plain_buf, plain_len);
pot_ptr->plain_len = plain_len;
@ -12491,6 +12508,30 @@ int main (int argc, char **argv)
// vector_width
cl_uint vector_width;
if (attack_mode == ATTACK_MODE_BF)
{
if (opencl_vector_width == OPENCL_VECTOR_WIDTH)
{
hc_clGetDeviceInfo (device_param->device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof (vector_width), &vector_width, NULL);
}
else
{
vector_width = opencl_vector_width;
}
}
else
{
vector_width = 1;
}
if (vector_width > 8) vector_width = 8;
device_param->vector_width = vector_width;
// max_compute_units
cl_uint device_processors;
hc_clGetDeviceInfo (device_param->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (device_processors), &device_processors, NULL);
@ -12541,9 +12582,9 @@ int main (int argc, char **argv)
char *device_name_chksum = (char *) mymalloc (INFOSZ);
#if __x86_64__
snprintf (device_name_chksum, INFOSZ - 1, "%u-%u-%s-%s-%s-%u", 64, device_param->vendor_id, device_param->device_name, device_param->device_version, device_param->driver_version, COMPTIME);
snprintf (device_name_chksum, INFOSZ - 1, "%u-%u-%u-%s-%s-%s-%u", 64, device_param->vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, COMPTIME);
#else
snprintf (device_name_chksum, INFOSZ - 1, "%u-%u-%s-%s-%s-%u", 32, device_param->vendor_id, device_param->device_name, device_param->device_version, device_param->driver_version, COMPTIME);
snprintf (device_name_chksum, INFOSZ - 1, "%u-%u-%u-%s-%s-%s-%u", 32, device_param->vendor_id, device_param->vector_width, device_param->device_name, device_param->device_version, device_param->driver_version, COMPTIME);
#endif
uint device_name_digest[4];
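
The extra %u in the checksum matters for the cached-kernel lookup: two builds that differ only in vector width must produce different cache keys, otherwise a binary compiled with the wrong -DVECT_SIZE would be reused. A minimal sketch of the idea, with made-up vendor id and device name:

char key_w4[64];
char key_w8[64];

// only the differing width field matters here; the other values are invented
snprintf (key_w4, sizeof (key_w4), "%u-%u-%u-%s", 64, 4098, 4, "Tahiti");
snprintf (key_w8, sizeof (key_w8), "%u-%u-%u-%s", 64, 4098, 8, "Tahiti");

// strcmp (key_w4, key_w8) != 0 -> separate cached kernels per vector width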
@ -13090,7 +13131,7 @@ int main (int argc, char **argv)
// CPU still needs lots of workitems, don't know why...
// for the testing phase, let's start with this
kernel_accel = 1;
// kernel_accel = 1;
}
uint kernel_power = device_processors * kernel_threads * kernel_accel;
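
For a sense of scale of kernel_power, a worked example with hypothetical device numbers:

uint device_processors = 32;   // compute units reported by the device
uint kernel_threads    = 256;  // work-items per work-group
uint kernel_accel      = 8;    // workload tuning factor

uint kernel_power = device_processors * kernel_threads * kernel_accel; // 65536 work-items per launch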
@ -13310,7 +13351,7 @@ int main (int argc, char **argv)
// we don't have sm_* on vendors not NV but it doesn't matter
sprintf (build_opts, "-I%s/ -DVENDOR_ID=%d -DCUDA_ARCH=%d", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor);
sprintf (build_opts, "-I%s/ -DVENDOR_ID=%d -DCUDA_ARCH=%d -DVECT_SIZE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width);
/**
* main kernel

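On the kernel side, -DVECT_SIZE is what makes the vectorized u32x types concrete. The mapping below is an assumption about how a shared OpenCL header such as types_ocl.c can consume the define, not code copied from this commit:

#if   VECT_SIZE == 1
typedef uint  u32x;   // scalar build, one candidate per inner-loop step
#elif VECT_SIZE == 2
typedef uint2 u32x;
#elif VECT_SIZE == 4
typedef uint4 u32x;
#elif VECT_SIZE == 8
typedef uint8 u32x;
#endif
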
@ -19,7 +19,7 @@ NEVER_CRACK="11600"
SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6221 6231 6241 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12100 12200 12300 12400 12500 12800 12900 13000"
OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable -d 1 --weak-hash-threshold=0"
OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 --opencl-device-types 1,2"
OUTD="test_$(date +%s)"
