1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-05-25 18:28:49 +00:00

Zero pws_buf before reuse

This commit is contained in:
Jens Steube 2016-02-22 21:20:16 +01:00
parent 6c10ca5853
commit b409e5e9e1
92 changed files with 6466 additions and 12082 deletions

View File

@ -59,18 +59,26 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -91,33 +99,27 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* append salt * append salt
*/ */
u32x s0[4]; u32x s0[4] = { 0 };
u32x s1[4] = { 0 };
u32x s2[4] = { 0 };
u32x s3[4] = { 0 };
s0[0] = salt_buf0[0]; s0[0] = salt_buf0[0];
s0[1] = salt_buf0[1]; s0[1] = salt_buf0[1];
s0[2] = salt_buf0[2]; s0[2] = salt_buf0[2];
s0[3] = salt_buf0[3]; s0[3] = salt_buf0[3];
u32x s1[4];
s1[0] = salt_buf1[0]; s1[0] = salt_buf1[0];
s1[1] = salt_buf1[1]; s1[1] = salt_buf1[1];
s1[2] = salt_buf1[2]; s1[2] = salt_buf1[2];
s1[3] = salt_buf1[3]; s1[3] = salt_buf1[3];
s2[0] = salt_buf2[0];
u32x s2[4]; s2[1] = salt_buf2[1];
s2[2] = salt_buf2[2];
s2[0] = 0; s2[3] = salt_buf2[3];
s2[1] = 0; s3[0] = salt_buf3[0];
s2[2] = 0; s3[1] = salt_buf3[1];
s2[3] = 0; s3[2] = salt_buf3[2];
s3[3] = salt_buf3[3];
u32x s3[4];
s3[0] = 0;
s3[1] = 0;
s3[2] = 0;
s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
@ -127,24 +129,19 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w0[1] |= s0[1]; w0[1] |= s0[1];
w0[2] |= s0[2]; w0[2] |= s0[2];
w0[3] |= s0[3]; w0[3] |= s0[3];
w1[0] |= s1[0]; w1[0] |= s1[0];
w1[1] |= s1[1]; w1[1] |= s1[1];
w1[2] |= s1[2]; w1[2] |= s1[2];
w1[3] |= s1[3]; w1[3] |= s1[3];
w2[0] |= s2[0]; w2[0] |= s2[0];
w2[1] |= s2[1]; w2[1] |= s2[1];
w2[2] |= s2[2]; w2[2] |= s2[2];
w2[3] |= s2[3]; w2[3] |= s2[3];
w3[0] |= s3[0]; w3[0] |= s3[0];
w3[1] |= s3[1]; w3[1] |= s3[1];
w3[2] = pw_salt_len * 8; w3[2] = pw_salt_len * 8;
w3[3] = 0; w3[3] = 0;
append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len);
/** /**
* md5 * md5
*/ */
@ -271,18 +268,26 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -315,33 +320,27 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* append salt * append salt
*/ */
u32x s0[4]; u32x s0[4] = { 0 };
u32x s1[4] = { 0 };
u32x s2[4] = { 0 };
u32x s3[4] = { 0 };
s0[0] = salt_buf0[0]; s0[0] = salt_buf0[0];
s0[1] = salt_buf0[1]; s0[1] = salt_buf0[1];
s0[2] = salt_buf0[2]; s0[2] = salt_buf0[2];
s0[3] = salt_buf0[3]; s0[3] = salt_buf0[3];
u32x s1[4];
s1[0] = salt_buf1[0]; s1[0] = salt_buf1[0];
s1[1] = salt_buf1[1]; s1[1] = salt_buf1[1];
s1[2] = salt_buf1[2]; s1[2] = salt_buf1[2];
s1[3] = salt_buf1[3]; s1[3] = salt_buf1[3];
s2[0] = salt_buf2[0];
u32x s2[4]; s2[1] = salt_buf2[1];
s2[2] = salt_buf2[2];
s2[0] = 0; s2[3] = salt_buf2[3];
s2[1] = 0; s3[0] = salt_buf3[0];
s2[2] = 0; s3[1] = salt_buf3[1];
s2[3] = 0; s3[2] = salt_buf3[2];
s3[3] = salt_buf3[3];
u32x s3[4];
s3[0] = 0;
s3[1] = 0;
s3[2] = 0;
s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); switch_buffer_by_offset_le (s0, s1, s2, s3, out_len);
@ -351,24 +350,19 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w0[1] |= s0[1]; w0[1] |= s0[1];
w0[2] |= s0[2]; w0[2] |= s0[2];
w0[3] |= s0[3]; w0[3] |= s0[3];
w1[0] |= s1[0]; w1[0] |= s1[0];
w1[1] |= s1[1]; w1[1] |= s1[1];
w1[2] |= s1[2]; w1[2] |= s1[2];
w1[3] |= s1[3]; w1[3] |= s1[3];
w2[0] |= s2[0]; w2[0] |= s2[0];
w2[1] |= s2[1]; w2[1] |= s2[1];
w2[2] |= s2[2]; w2[2] |= s2[2];
w2[3] |= s2[3]; w2[3] |= s2[3];
w3[0] |= s3[0]; w3[0] |= s3[0];
w3[1] |= s3[1]; w3[1] |= s3[1];
w3[2] = pw_salt_len * 8; w3[2] = pw_salt_len * 8;
w3[3] = 0; w3[3] = 0;
append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len);
/** /**
* md5 * md5
*/ */

View File

@ -56,15 +56,25 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -138,8 +148,8 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = 0; w3[2] = wordl3[2] | wordr3[2];
w3[3] = 0; w3[3] = wordl3[3] | wordr3[3];
/** /**
* append salt * append salt
@ -158,6 +168,14 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s1[1] = salt_buf1[1]; s1[1] = salt_buf1[1];
s1[2] = salt_buf1[2]; s1[2] = salt_buf1[2];
s1[3] = salt_buf1[3]; s1[3] = salt_buf1[3];
s2[0] = salt_buf2[0];
s2[1] = salt_buf2[1];
s2[2] = salt_buf2[2];
s2[3] = salt_buf2[3];
s3[0] = salt_buf3[0];
s3[1] = salt_buf3[1];
s3[2] = salt_buf3[2];
s3[3] = salt_buf3[3];
switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len);
@ -305,15 +323,25 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -399,8 +427,8 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = 0; w3[2] = wordl3[2] | wordr3[2];
w3[3] = 0; w3[3] = wordl3[3] | wordr3[3];
/** /**
* append salt * append salt
@ -419,6 +447,14 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s1[1] = salt_buf1[1]; s1[1] = salt_buf1[1];
s1[2] = salt_buf1[2]; s1[2] = salt_buf1[2];
s1[3] = salt_buf1[3]; s1[3] = salt_buf1[3];
s2[0] = salt_buf2[0];
s2[1] = salt_buf2[1];
s2[2] = salt_buf2[2];
s2[3] = salt_buf2[3];
s3[0] = salt_buf3[0];
s3[1] = salt_buf3[1];
s3[2] = salt_buf3[2];
s3[3] = salt_buf3[3];
switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_VV (s0, s1, s2, s3, pw_len);

View File

@ -51,32 +51,28 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
u32 salt_buf2[4];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = 0; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = 0; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = 0; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
u32 salt_buf3[4]; const u32 salt_len = salt_bufs[salt_pos].salt_len;
salt_buf3[0] = 0;
salt_buf3[1] = 0;
salt_buf3[2] = 0;
salt_buf3[3] = 0;
switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
@ -97,11 +93,10 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
w[14] |= salt_buf3[2]; w[14] |= salt_buf3[2];
w[15] |= salt_buf3[3]; w[15] |= salt_buf3[3];
const u32 salt_len = salt_bufs[salt_pos].salt_len;
const u32 pw_salt_len = pw_len + salt_len; const u32 pw_salt_len = pw_len + salt_len;
w[14] = pw_salt_len * 8; w[14] = pw_salt_len * 8;
w[15] = 0;
/** /**
* base * base

View File

@ -59,18 +59,26 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -125,10 +133,19 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1_t[1] |= salt_buf1[1]; w1_t[1] |= salt_buf1[1];
w1_t[2] |= salt_buf1[2]; w1_t[2] |= salt_buf1[2];
w1_t[3] |= salt_buf1[3]; w1_t[3] |= salt_buf1[3];
w2_t[0] |= salt_buf2[0];
w2_t[1] |= salt_buf2[1];
w2_t[2] |= salt_buf2[2];
w2_t[3] |= salt_buf2[3];
w3_t[0] |= salt_buf3[0];
w3_t[1] |= salt_buf3[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len);
w3_t[2] = out_salt_len * 8; w3_t[2] = out_salt_len * 8;
w3_t[3] = 0;
/** /**
* md5 * md5
@ -256,18 +273,26 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -334,10 +359,19 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1_t[1] |= salt_buf1[1]; w1_t[1] |= salt_buf1[1];
w1_t[2] |= salt_buf1[2]; w1_t[2] |= salt_buf1[2];
w1_t[3] |= salt_buf1[3]; w1_t[3] |= salt_buf1[3];
w2_t[0] |= salt_buf2[0];
w2_t[1] |= salt_buf2[1];
w2_t[2] |= salt_buf2[2];
w2_t[3] |= salt_buf2[3];
w3_t[0] |= salt_buf3[0];
w3_t[1] |= salt_buf3[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len);
w3_t[2] = out_salt_len * 8; w3_t[2] = out_salt_len * 8;
w3_t[3] = 0;
/** /**
* md5 * md5

View File

@ -56,15 +56,25 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -157,12 +167,12 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[1] |= salt_buf1[1]; w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2]; w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3]; w1[3] |= salt_buf1[3];
w2[0] |= 0; w2[0] |= salt_buf2[0];
w2[1] |= 0; w2[1] |= salt_buf2[1];
w2[2] |= 0; w2[2] |= salt_buf2[2];
w2[3] |= 0; w2[3] |= salt_buf2[3];
w3[0] |= 0; w3[0] |= salt_buf3[0];
w3[1] |= 0; w3[1] |= salt_buf3[1];
w3[2] = pw_salt_len * 8; w3[2] = pw_salt_len * 8;
w3[3] = 0; w3[3] = 0;
@ -290,15 +300,25 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -403,12 +423,12 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[1] |= salt_buf1[1]; w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2]; w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3]; w1[3] |= salt_buf1[3];
w2[0] |= 0; w2[0] |= salt_buf2[0];
w2[1] |= 0; w2[1] |= salt_buf2[1];
w2[2] |= 0; w2[2] |= salt_buf2[2];
w2[3] |= 0; w2[3] |= salt_buf2[3];
w3[0] |= 0; w3[0] |= salt_buf3[0];
w3[1] |= 0; w3[1] |= salt_buf3[1];
w3[2] = pw_salt_len * 8; w3[2] = pw_salt_len * 8;
w3[3] = 0; w3[3] = 0;

View File

@ -20,7 +20,7 @@
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c" #include "OpenCL/simd.c"
static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) static void m00020m (u32 t0[4], u32 t1[4], u32 t2[4], u32 t3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset)
{ {
/** /**
* modifier * modifier
@ -34,32 +34,26 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
u32 salt_buf2[4]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[0] = 0; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf2[1] = 0; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf2[2] = 0; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf2[3] = 0; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
u32 salt_buf3[4];
salt_buf3[0] = 0;
salt_buf3[1] = 0;
salt_buf3[2] = 0;
salt_buf3[3] = 0;
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -74,22 +68,22 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
u32 w2_t[4]; u32 w2_t[4];
u32 w3_t[4]; u32 w3_t[4];
w0_t[0] = w0[0]; w0_t[0] = t0[0];
w0_t[1] = w0[1]; w0_t[1] = t0[1];
w0_t[2] = w0[2]; w0_t[2] = t0[2];
w0_t[3] = w0[3]; w0_t[3] = t0[3];
w1_t[0] = w1[0]; w1_t[0] = t1[0];
w1_t[1] = w1[1]; w1_t[1] = t1[1];
w1_t[2] = w1[2]; w1_t[2] = t1[2];
w1_t[3] = w1[3]; w1_t[3] = t1[3];
w2_t[0] = w2[0]; w2_t[0] = t2[0];
w2_t[1] = w2[1]; w2_t[1] = t2[1];
w2_t[2] = w2[2]; w2_t[2] = t2[2];
w2_t[3] = w2[3]; w2_t[3] = t2[3];
w3_t[0] = w3[0]; w3_t[0] = t3[0];
w3_t[1] = w3[1]; w3_t[1] = t3[1];
w3_t[2] = w3[2]; w3_t[2] = t3[2];
w3_t[3] = w3[3]; w3_t[3] = t3[3];
switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len);
@ -114,7 +108,7 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
* loop * loop
*/ */
u32 w0l = w0[0]; u32 w0l = t0[0];
for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE)
{ {
@ -143,27 +137,29 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
overwrite_at_le (wx, w0lr, salt_len); overwrite_at_le (wx, w0lr, salt_len);
u32x w0_t[4]; u32x w0[4];
u32x w1_t[4]; u32x w1[4];
u32x w2_t[4]; u32x w2[4];
u32x w3_t[4]; u32x w3[4];
w0_t[0] = wx[ 0]; w0[0] = wx[ 0];
w0_t[1] = wx[ 1]; w0[1] = wx[ 1];
w0_t[2] = wx[ 2]; w0[2] = wx[ 2];
w0_t[3] = wx[ 3]; w0[3] = wx[ 3];
w1_t[0] = wx[ 4]; w1[0] = wx[ 4];
w1_t[1] = wx[ 5]; w1[1] = wx[ 5];
w1_t[2] = wx[ 6]; w1[2] = wx[ 6];
w1_t[3] = wx[ 7]; w1[3] = wx[ 7];
w2_t[0] = wx[ 8]; w2[0] = wx[ 8];
w2_t[1] = wx[ 9]; w2[1] = wx[ 9];
w2_t[2] = wx[10]; w2[2] = wx[10];
w2_t[3] = wx[11]; w2[3] = wx[11];
w3_t[0] = wx[12]; w3[0] = wx[12];
w3_t[1] = wx[13]; w3[1] = wx[13];
w3_t[2] = pw_salt_len * 8; w3[2] = pw_salt_len * 8;
w3_t[3] = 0; w3[3] = 0;
append_0x80_4x4 (w0, w1, w2, w3, pw_salt_len);
/** /**
* md5 * md5
@ -174,73 +170,73 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
u32x c = MD5M_C; u32x c = MD5M_C;
u32x d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03);
MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13);
MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
COMPARE_M_SIMD (a, d, c, b); COMPARE_M_SIMD (a, d, c, b);
} }
@ -272,32 +268,26 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
u32 salt_buf2[4]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[0] = 0; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf2[1] = 0; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf2[2] = 0; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf2[3] = 0; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
u32 salt_buf3[4];
salt_buf3[0] = 0;
salt_buf3[1] = 0;
salt_buf3[2] = 0;
salt_buf3[3] = 0;
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;

View File

@ -61,18 +61,26 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -147,12 +155,8 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2_t[3] |= s2[3]; w2_t[3] |= s2[3];
w3_t[0] |= s3[0]; w3_t[0] |= s3[0];
w3_t[1] |= s3[1]; w3_t[1] |= s3[1];
w3_t[2] |= s3[2]; w3_t[2] = out_salt_len * 8;
w3_t[3] |= s3[3]; w3_t[3] = 0;
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len);
w3_t[2] = out_salt_len * 8;
/** /**
* md5 * md5
@ -280,18 +284,26 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -378,12 +390,8 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2_t[3] |= s2[3]; w2_t[3] |= s2[3];
w3_t[0] |= s3[0]; w3_t[0] |= s3[0];
w3_t[1] |= s3[1]; w3_t[1] |= s3[1];
w3_t[2] |= s3[2]; w3_t[2] = out_salt_len * 8;
w3_t[3] |= s3[3]; w3_t[3] = 0;
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len);
w3_t[2] = out_salt_len * 8;
/** /**
* md5 * md5

View File

@ -58,15 +58,25 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -312,15 +322,25 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;

View File

@ -51,32 +51,28 @@ static void m00030m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
u32 salt_buf2[4];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = 0; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = 0; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = 0; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
u32 salt_buf3[4]; const u32 salt_len = salt_bufs[salt_pos].salt_len;
salt_buf3[0] = 0;
salt_buf3[1] = 0;
salt_buf3[2] = 0;
salt_buf3[3] = 0;
switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len);
@ -97,11 +93,10 @@ static void m00030m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k
w[14] |= salt_buf3[2]; w[14] |= salt_buf3[2];
w[15] |= salt_buf3[3]; w[15] |= salt_buf3[3];
const u32 salt_len = salt_bufs[salt_pos].salt_len;
const u32 pw_salt_len = pw_len + salt_len; const u32 pw_salt_len = pw_len + salt_len;
w[14] = pw_salt_len * 8; w[14] = pw_salt_len * 8;
w[15] = 0;
/** /**
* base * base

View File

@ -59,18 +59,26 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -111,10 +119,19 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1_t[1] |= salt_buf1[1]; w1_t[1] |= salt_buf1[1];
w1_t[2] |= salt_buf1[2]; w1_t[2] |= salt_buf1[2];
w1_t[3] |= salt_buf1[3]; w1_t[3] |= salt_buf1[3];
w2_t[0] |= salt_buf2[0];
w2_t[1] |= salt_buf2[1];
w2_t[2] |= salt_buf2[2];
w2_t[3] |= salt_buf2[3];
w3_t[0] |= salt_buf3[0];
w3_t[1] |= salt_buf3[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len);
w3_t[2] = out_salt_len * 8; w3_t[2] = out_salt_len * 8;
w3_t[3] = 0;
/** /**
* md5 * md5
@ -242,18 +259,26 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -306,10 +331,19 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1_t[1] |= salt_buf1[1]; w1_t[1] |= salt_buf1[1];
w1_t[2] |= salt_buf1[2]; w1_t[2] |= salt_buf1[2];
w1_t[3] |= salt_buf1[3]; w1_t[3] |= salt_buf1[3];
w2_t[0] |= salt_buf2[0];
w2_t[1] |= salt_buf2[1];
w2_t[2] |= salt_buf2[2];
w2_t[3] |= salt_buf2[3];
w3_t[0] |= salt_buf3[0];
w3_t[1] |= salt_buf3[1];
w3_t[2] |= salt_buf3[2];
w3_t[3] |= salt_buf3[3];
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len); append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, out_salt_len);
w3_t[2] = out_salt_len * 8; w3_t[2] = out_salt_len * 8;
w3_t[3] = 0;
/** /**
* md5 * md5

View File

@ -58,15 +58,25 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -164,12 +174,12 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[1] |= salt_buf1[1]; w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2]; w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3]; w1[3] |= salt_buf1[3];
w2[0] |= 0; w2[0] |= salt_buf2[0];
w2[1] |= 0; w2[1] |= salt_buf2[1];
w2[2] |= 0; w2[2] |= salt_buf2[2];
w2[3] |= 0; w2[3] |= salt_buf2[3];
w3[0] |= 0; w3[0] |= salt_buf3[0];
w3[1] |= 0; w3[1] |= salt_buf3[1];
w3[2] = pw_salt_len * 8; w3[2] = pw_salt_len * 8;
w3[3] = 0; w3[3] = 0;
@ -298,15 +308,25 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4]; u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -416,12 +436,12 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[1] |= salt_buf1[1]; w1[1] |= salt_buf1[1];
w1[2] |= salt_buf1[2]; w1[2] |= salt_buf1[2];
w1[3] |= salt_buf1[3]; w1[3] |= salt_buf1[3];
w2[0] |= 0; w2[0] |= salt_buf2[0];
w2[1] |= 0; w2[1] |= salt_buf2[1];
w2[2] |= 0; w2[2] |= salt_buf2[2];
w2[3] |= 0; w2[3] |= salt_buf2[3];
w3[0] |= 0; w3[0] |= salt_buf3[0];
w3[1] |= 0; w3[1] |= salt_buf3[1];
w3[2] = pw_salt_len * 8; w3[2] = pw_salt_len * 8;
w3[3] = 0; w3[3] = 0;

View File

@ -34,32 +34,26 @@ static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
u32 salt_buf2[4]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[0] = 0; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf2[1] = 0; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf2[2] = 0; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf2[3] = 0; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
u32 salt_buf3[4];
salt_buf3[0] = 0;
salt_buf3[1] = 0;
salt_buf3[2] = 0;
salt_buf3[3] = 0;
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;
@ -272,32 +266,26 @@ static void m00040s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
u32 salt_buf2[4]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[0] = 0; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf2[1] = 0; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf2[2] = 0; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf2[3] = 0; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
u32 salt_buf3[4];
salt_buf3[0] = 0;
salt_buf3[1] = 0;
salt_buf3[2] = 0;
salt_buf3[3] = 0;
const u32 salt_len = salt_bufs[salt_pos].salt_len; const u32 salt_len = salt_bufs[salt_pos].salt_len;

View File

@ -242,28 +242,22 @@ __kernel void m00050_m04 (__global pw_t *pws, __global kernel_rule_t * rules_bu
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
u32 salt_buf2[4];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
u32 salt_buf3[4];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
@ -391,28 +385,22 @@ __kernel void m00050_s04 (__global pw_t *pws, __global kernel_rule_t * rules_bu
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
u32 salt_buf2[4];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
u32 salt_buf3[4];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];

View File

@ -217,28 +217,22 @@ static void m00050m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
u32 salt_buf2[4];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
u32 salt_buf3[4];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
@ -334,28 +328,22 @@ static void m00050s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
u32 salt_buf2[4];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8]; salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9]; salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10]; salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11]; salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
u32 salt_buf3[4];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12]; salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13]; salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14]; salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];

View File

@ -242,50 +242,54 @@ __kernel void m00060_m04 (__global pw_t *pws, __global kernel_rule_t * rules_bu
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len;
/** /**
* pads * pads
*/ */
u32x w0_t[4]; u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2]; w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3]; w0_t[3] = salt_buf0[3];
u32x w1_t[4];
w1_t[0] = salt_buf1[0]; w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1]; w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2]; w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3]; w1_t[3] = salt_buf1[3];
w2_t[0] = salt_buf2[0];
u32x w2_t[4]; w2_t[1] = salt_buf2[1];
w2_t[2] = salt_buf2[2];
w2_t[0] = 0; w2_t[3] = salt_buf2[3];
w2_t[1] = 0; w3_t[0] = salt_buf3[0];
w2_t[2] = 0; w3_t[1] = salt_buf3[1];
w2_t[3] = 0; w3_t[2] = salt_buf3[2];
w3_t[3] = salt_buf3[3];
u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
u32x ipad[4]; u32x ipad[4];
u32x opad[4]; u32x opad[4];
@ -377,50 +381,54 @@ __kernel void m00060_s04 (__global pw_t *pws, __global kernel_rule_t * rules_bu
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len;
/** /**
* pads * pads
*/ */
u32x w0_t[4]; u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2]; w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3]; w0_t[3] = salt_buf0[3];
u32x w1_t[4];
w1_t[0] = salt_buf1[0]; w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1]; w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2]; w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3]; w1_t[3] = salt_buf1[3];
w2_t[0] = salt_buf2[0];
u32x w2_t[4]; w2_t[1] = salt_buf2[1];
w2_t[2] = salt_buf2[2];
w2_t[0] = 0; w2_t[3] = salt_buf2[3];
w2_t[1] = 0; w3_t[0] = salt_buf3[0];
w2_t[2] = 0; w3_t[1] = salt_buf3[1];
w2_t[3] = 0; w3_t[2] = salt_buf3[2];
w3_t[3] = salt_buf3[3];
u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
u32x ipad[4]; u32x ipad[4];
u32x opad[4]; u32x opad[4];

View File

@ -235,32 +235,26 @@ static void m00060m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32x w0_t[4]; u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2]; w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3]; w0_t[3] = salt_buf0[3];
u32x w1_t[4];
w1_t[0] = salt_buf1[0]; w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1]; w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2]; w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3]; w1_t[3] = salt_buf1[3];
w2_t[0] = salt_buf2[0];
u32x w2_t[4]; w2_t[1] = salt_buf2[1];
w2_t[2] = salt_buf2[2];
w2_t[0] = 0; w2_t[3] = salt_buf2[3];
w2_t[1] = 0; w3_t[0] = salt_buf3[0];
w2_t[2] = 0; w3_t[1] = salt_buf3[1];
w2_t[3] = 0; w3_t[2] = salt_buf3[2];
w3_t[3] = salt_buf3[3];
u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
u32x ipad[4]; u32x ipad[4];
u32x opad[4]; u32x opad[4];
@ -320,50 +314,54 @@ static void m00060s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
u32 salt_buf2[4];
u32 salt_buf3[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[ 0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[ 1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[ 2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[ 3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[ 4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[ 5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[ 6];
salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7]; salt_buf1[3] = salt_bufs[salt_pos].salt_buf[ 7];
salt_buf2[0] = salt_bufs[salt_pos].salt_buf[ 8];
salt_buf2[1] = salt_bufs[salt_pos].salt_buf[ 9];
salt_buf2[2] = salt_bufs[salt_pos].salt_buf[10];
salt_buf2[3] = salt_bufs[salt_pos].salt_buf[11];
salt_buf3[0] = salt_bufs[salt_pos].salt_buf[12];
salt_buf3[1] = salt_bufs[salt_pos].salt_buf[13];
salt_buf3[2] = salt_bufs[salt_pos].salt_buf[14];
salt_buf3[3] = salt_bufs[salt_pos].salt_buf[15];
const u32 salt_len = salt_bufs[salt_pos].salt_len;
/** /**
* pads * pads
*/ */
u32x w0_t[4]; u32x w0_t[4];
u32x w1_t[4];
u32x w2_t[4];
u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2]; w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3]; w0_t[3] = salt_buf0[3];
u32x w1_t[4];
w1_t[0] = salt_buf1[0]; w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1]; w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2]; w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3]; w1_t[3] = salt_buf1[3];
w2_t[0] = salt_buf2[0];
u32x w2_t[4]; w2_t[1] = salt_buf2[1];
w2_t[2] = salt_buf2[2];
w2_t[0] = 0; w2_t[3] = salt_buf2[3];
w2_t[1] = 0; w3_t[0] = salt_buf3[0];
w2_t[2] = 0; w3_t[1] = salt_buf3[1];
w2_t[3] = 0; w3_t[2] = salt_buf3[2];
w3_t[3] = salt_buf3[3];
u32x w3_t[4];
w3_t[0] = 0;
w3_t[1] = 0;
w3_t[2] = 0;
w3_t[3] = 0;
u32x ipad[4]; u32x ipad[4];
u32x opad[4]; u32x opad[4];

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,54 +36,31 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -95,39 +72,25 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -166,32 +129,32 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[2] = wordl0[2] | wordr0[2] | s0[2];
w0[3] = wordl0[3] | wordr0[3] | s0[3]; w0[3] = wordl0[3] | wordr0[3] | s0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0] | s1[0]; w1[0] = wordl1[0] | wordr1[0] | s1[0];
w1[1] = wordl1[1] | wordr1[1] | s1[1]; w1[1] = wordl1[1] | wordr1[1] | s1[1];
w1[2] = wordl1[2] | wordr1[2] | s1[2]; w1[2] = wordl1[2] | wordr1[2] | s1[2];
w1[3] = wordl1[3] | wordr1[3] | s1[3]; w1[3] = wordl1[3] | wordr1[3] | s1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0] | s2[0]; w2[0] = wordl2[0] | wordr2[0] | s2[0];
w2[1] = wordl2[1] | wordr2[1] | s2[1]; w2[1] = wordl2[1] | wordr2[1] | s2[1];
w2[2] = wordl2[2] | wordr2[2] | s2[2]; w2[2] = wordl2[2] | wordr2[2] | s2[2];
w2[3] = wordl2[3] | wordr2[3] | s2[3]; w2[3] = wordl2[3] | wordr2[3] | s2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0] | s3[0]; w3[0] = wordl3[0] | wordr3[0] | s3[0];
w3[1] = wordl3[1] | wordr3[1] | s3[1]; w3[1] = wordl3[1] | wordr3[1] | s3[1];
@ -204,28 +167,28 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -323,13 +286,7 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_M_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -357,54 +314,31 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -428,45 +362,31 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -505,32 +425,32 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[2] = wordl0[2] | wordr0[2] | s0[2];
w0[3] = wordl0[3] | wordr0[3] | s0[3]; w0[3] = wordl0[3] | wordr0[3] | s0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0] | s1[0]; w1[0] = wordl1[0] | wordr1[0] | s1[0];
w1[1] = wordl1[1] | wordr1[1] | s1[1]; w1[1] = wordl1[1] | wordr1[1] | s1[1];
w1[2] = wordl1[2] | wordr1[2] | s1[2]; w1[2] = wordl1[2] | wordr1[2] | s1[2];
w1[3] = wordl1[3] | wordr1[3] | s1[3]; w1[3] = wordl1[3] | wordr1[3] | s1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0] | s2[0]; w2[0] = wordl2[0] | wordr2[0] | s2[0];
w2[1] = wordl2[1] | wordr2[1] | s2[1]; w2[1] = wordl2[1] | wordr2[1] | s2[1];
w2[2] = wordl2[2] | wordr2[2] | s2[2]; w2[2] = wordl2[2] | wordr2[2] | s2[2];
w2[3] = wordl2[3] | wordr2[3] | s2[3]; w2[3] = wordl2[3] | wordr2[3] | s2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0] | s3[0]; w3[0] = wordl3[0] | wordr3[0] | s3[0];
w3[1] = wordl3[1] | wordr3[1] | s3[1]; w3[1] = wordl3[1] | wordr3[1] | s3[1];
@ -543,28 +463,28 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -658,20 +578,14 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_S_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,54 +36,31 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -95,43 +72,35 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -154,12 +123,12 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
@ -214,11 +183,11 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -316,13 +285,7 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_M_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -350,54 +313,31 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -421,49 +361,41 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -486,12 +418,12 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
@ -546,11 +478,11 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -645,19 +577,13 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_S_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
// no unicode yet // no unicode yet
@ -38,54 +38,31 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -97,39 +74,25 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -172,10 +135,10 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -194,10 +157,10 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -244,11 +207,11 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -346,13 +309,7 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_M_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -380,54 +337,31 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -451,45 +385,31 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -532,10 +452,10 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -554,10 +474,10 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -604,11 +524,11 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -703,19 +623,13 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_S_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
// no unicode yet // no unicode yet
@ -38,54 +38,31 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -97,39 +74,25 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -142,10 +105,10 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -164,10 +127,10 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -208,11 +171,11 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -310,13 +273,7 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_M_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -344,54 +301,31 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -415,45 +349,31 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -466,10 +386,10 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -488,10 +408,10 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -532,11 +452,11 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -631,19 +551,13 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_S_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{ {
@ -253,41 +253,20 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -312,67 +291,53 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -383,28 +348,28 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -437,12 +402,7 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -470,41 +430,20 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -541,67 +480,53 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -612,28 +537,28 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -666,12 +591,7 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{ {
@ -253,41 +253,20 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -347,67 +326,53 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -437,12 +402,7 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -470,41 +430,20 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -576,67 +515,53 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -666,12 +591,7 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,110 +36,71 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -150,28 +111,28 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -274,25 +235,11 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
d += SHA1M_D; d += SHA1M_D;
c += SHA1M_C; c += SHA1M_C;
{ COMPARE_M_SIMD (a, e, d, c);
const u32 r0 = a;
const u32 r1 = e;
const u32 r2 = d;
const u32 r3 = c;
#include COMPARE_M
}
a &= 0x00000fff; a &= 0x00000fff;
{ COMPARE_M_SIMD (a, e, d, c);
const u32 r0 = a;
const u32 r1 = e;
const u32 r2 = d;
const u32 r3 = c;
#include COMPARE_M
}
} }
} }
@ -320,43 +267,20 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -373,69 +297,53 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -446,28 +354,28 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -570,25 +478,11 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
d += SHA1M_D; d += SHA1M_D;
c += SHA1M_C; c += SHA1M_C;
{ COMPARE_S_SIMD (a, e, d, c);
const u32 r0 = a;
const u32 r1 = e;
const u32 r2 = d;
const u32 r3 = c;
#include COMPARE_S
}
a &= 0x00000fff; a &= 0x00000fff;
{ COMPARE_S_SIMD (a, e, d, c);
const u32 r0 = a;
const u32 r1 = e;
const u32 r2 = d;
const u32 r3 = c;
#include COMPARE_S
}
} }
} }

View File

@ -5,6 +5,8 @@
#define _MYSQL323_ #define _MYSQL323_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,85 +36,50 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w_t[16]; u32x w_t[16];
w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 0] = wordl0[0] | wordr0[0];
w_t[ 1] = wordl0[1] | wordr0[1]; w_t[ 1] = wordl0[1] | wordr0[1];
@ -133,8 +98,10 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w_t[14] = wordl3[2] | wordr3[2]; w_t[14] = wordl3[2] | wordr3[2];
w_t[15] = 0; w_t[15] = 0;
u32 a = MYSQL323_A; u32x a = MYSQL323_A;
u32 b = MYSQL323_B; u32x b = MYSQL323_B;
u32x c = 0;
u32x d = 0;
u32 add = 7; u32 add = 7;
@ -181,12 +148,7 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
a &= 0x7fffffff; a &= 0x7fffffff;
b &= 0x7fffffff; b &= 0x7fffffff;
const u32 r0 = a; COMPARE_M_SIMD (a, b, c, d);
const u32 r1 = b;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_M
} }
} }
@ -214,41 +176,20 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -265,46 +206,32 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w_t[16]; u32x w_t[16];
w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 0] = wordl0[0] | wordr0[0];
w_t[ 1] = wordl0[1] | wordr0[1]; w_t[ 1] = wordl0[1] | wordr0[1];
@ -323,8 +250,10 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w_t[14] = wordl3[2] | wordr3[2]; w_t[14] = wordl3[2] | wordr3[2];
w_t[15] = 0; w_t[15] = 0;
u32 a = MYSQL323_A; u32x a = MYSQL323_A;
u32 b = MYSQL323_B; u32x b = MYSQL323_B;
u32x c = 0;
u32x d = 0;
u32 add = 7; u32 add = 7;
@ -371,12 +300,7 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
a &= 0x7fffffff; a &= 0x7fffffff;
b &= 0x7fffffff; b &= 0x7fffffff;
const u32 r0 = a; COMPARE_S_SIMD (a, b, c, d);
const u32 r1 = b;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,110 +36,71 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -150,28 +111,28 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -394,13 +355,7 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_M_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -428,43 +383,20 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -481,75 +413,59 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -560,28 +476,28 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -800,20 +716,14 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_S_SIMD (d, e, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD4_ #define _MD4_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,118 +36,81 @@ __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0[1], MD4C00, MD4S01);
@ -199,13 +162,7 @@ __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD4_STEP (MD4_H , d, a, b, c, w2[3], MD4C02, MD4S21); MD4_STEP (MD4_H , d, a, b, c, w2[3], MD4C02, MD4S21);
MD4_STEP (MD4_H , c, d, a, b, w1[3], MD4C02, MD4S22); MD4_STEP (MD4_H , c, d, a, b, w1[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3[3], MD4C02, MD4S23); MD4_STEP (MD4_H , b, c, d, a, w3[3], MD4C02, MD4S23);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -233,43 +190,19 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -286,77 +219,63 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0[1], MD4C00, MD4S01);
@ -408,13 +327,7 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD4_STEP (MD4_H , d, a, b, c, w2[3], MD4C02, MD4S21); MD4_STEP (MD4_H , d, a, b, c, w2[3], MD4C02, MD4S21);
MD4_STEP (MD4_H , c, d, a, b, w1[3], MD4C02, MD4S22); MD4_STEP (MD4_H , c, d, a, b, w1[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3[3], MD4C02, MD4S23); MD4_STEP (MD4_H , b, c, d, a, w3[3], MD4C02, MD4S23);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD4_ #define _MD4_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,128 +36,91 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
w3_t[2] = pw_len * 8 * 2; w3_t[2] = pw_len * 8 * 2;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);
@ -209,13 +172,7 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21);
MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -243,43 +200,20 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -296,87 +230,73 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
w3_t[2] = pw_len * 8 * 2; w3_t[2] = pw_len * 8 * 2;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);
@ -428,13 +348,7 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21);
MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD4_ #define _MD4_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,43 +36,20 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -104,87 +81,73 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
w3_t[2] = pw_len * 8 * 2; w3_t[2] = pw_len * 8 * 2;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);
@ -314,13 +277,7 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21);
MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -348,43 +305,20 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -428,87 +362,73 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
w3_t[2] = pw_len * 8 * 2; w3_t[2] = pw_len * 8 * 2;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);
@ -636,20 +556,12 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD4_STEP (MD4_H , b, c, d, a, w3_t[1], MD4C02, MD4S23); MD4_STEP (MD4_H , b, c, d, a, w3_t[1], MD4C02, MD4S23);
MD4_STEP (MD4_H , a, b, c, d, w0_t[3], MD4C02, MD4S20); MD4_STEP (MD4_H , a, b, c, d, w0_t[3], MD4C02, MD4S20);
bool q_cond = allx (search[0] != a); if (MATCHES_NONE_VS (a, search[0])) continue;
if (q_cond) continue;
MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21);
MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22);
MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,92 +36,53 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -144,31 +105,31 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* SHA256 * SHA256
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -238,13 +199,7 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_M_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_M
} }
} }
@ -272,43 +227,20 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -325,51 +257,35 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -392,31 +308,31 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* SHA256 * SHA256
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -486,13 +402,7 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_S_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,54 +36,31 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -95,39 +72,25 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -166,14 +129,14 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
@ -198,31 +161,31 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -292,13 +255,7 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_M_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_M
} }
} }
@ -326,54 +283,31 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -397,39 +331,25 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -468,14 +388,14 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
@ -500,31 +420,31 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -594,13 +514,7 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_S_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,54 +36,31 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -95,43 +72,35 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -154,7 +123,7 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
@ -173,31 +142,31 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -267,13 +236,7 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_M_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_M
} }
} }
@ -301,54 +264,31 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -372,43 +312,35 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -431,7 +363,7 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
@ -450,31 +382,31 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -544,13 +476,7 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_S_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,54 +36,31 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -95,43 +72,35 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -186,10 +155,10 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t2[4]; u32x w0_t2[4];
u32 w1_t2[4]; u32x w1_t2[4];
u32 w2_t2[4]; u32x w2_t2[4];
u32 w3_t2[4]; u32x w3_t2[4];
make_unicode (w0, w0_t2, w1_t2); make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2); make_unicode (w1, w2_t2, w3_t2);
@ -217,31 +186,31 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0_t2[0]); u32x w0_t = swap32 (w0_t2[0]);
u32 w1_t = swap32 (w0_t2[1]); u32x w1_t = swap32 (w0_t2[1]);
u32 w2_t = swap32 (w0_t2[2]); u32x w2_t = swap32 (w0_t2[2]);
u32 w3_t = swap32 (w0_t2[3]); u32x w3_t = swap32 (w0_t2[3]);
u32 w4_t = swap32 (w1_t2[0]); u32x w4_t = swap32 (w1_t2[0]);
u32 w5_t = swap32 (w1_t2[1]); u32x w5_t = swap32 (w1_t2[1]);
u32 w6_t = swap32 (w1_t2[2]); u32x w6_t = swap32 (w1_t2[2]);
u32 w7_t = swap32 (w1_t2[3]); u32x w7_t = swap32 (w1_t2[3]);
u32 w8_t = swap32 (w2_t2[0]); u32x w8_t = swap32 (w2_t2[0]);
u32 w9_t = swap32 (w2_t2[1]); u32x w9_t = swap32 (w2_t2[1]);
u32 wa_t = swap32 (w2_t2[2]); u32x wa_t = swap32 (w2_t2[2]);
u32 wb_t = swap32 (w2_t2[3]); u32x wb_t = swap32 (w2_t2[3]);
u32 wc_t = swap32 (w3_t2[0]); u32x wc_t = swap32 (w3_t2[0]);
u32 wd_t = swap32 (w3_t2[1]); u32x wd_t = swap32 (w3_t2[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -311,13 +280,7 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_M_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_M
} }
} }
@ -345,54 +308,31 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -416,43 +356,35 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -507,10 +439,10 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t2[4]; u32x w0_t2[4];
u32 w1_t2[4]; u32x w1_t2[4];
u32 w2_t2[4]; u32x w2_t2[4];
u32 w3_t2[4]; u32x w3_t2[4];
make_unicode (w0, w0_t2, w1_t2); make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2); make_unicode (w1, w2_t2, w3_t2);
@ -538,31 +470,31 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0_t2[0]); u32x w0_t = swap32 (w0_t2[0]);
u32 w1_t = swap32 (w0_t2[1]); u32x w1_t = swap32 (w0_t2[1]);
u32 w2_t = swap32 (w0_t2[2]); u32x w2_t = swap32 (w0_t2[2]);
u32 w3_t = swap32 (w0_t2[3]); u32x w3_t = swap32 (w0_t2[3]);
u32 w4_t = swap32 (w1_t2[0]); u32x w4_t = swap32 (w1_t2[0]);
u32 w5_t = swap32 (w1_t2[1]); u32x w5_t = swap32 (w1_t2[1]);
u32 w6_t = swap32 (w1_t2[2]); u32x w6_t = swap32 (w1_t2[2]);
u32 w7_t = swap32 (w1_t2[3]); u32x w7_t = swap32 (w1_t2[3]);
u32 w8_t = swap32 (w2_t2[0]); u32x w8_t = swap32 (w2_t2[0]);
u32 w9_t = swap32 (w2_t2[1]); u32x w9_t = swap32 (w2_t2[1]);
u32 wa_t = swap32 (w2_t2[2]); u32x wa_t = swap32 (w2_t2[2]);
u32 wb_t = swap32 (w2_t2[3]); u32x wb_t = swap32 (w2_t2[3]);
u32 wc_t = swap32 (w3_t2[0]); u32x wc_t = swap32 (w3_t2[0]);
u32 wd_t = swap32 (w3_t2[1]); u32x wd_t = swap32 (w3_t2[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -632,13 +564,7 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_S_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,54 +36,31 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -95,43 +72,35 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -156,10 +125,10 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t2[4]; u32x w0_t2[4];
u32 w1_t2[4]; u32x w1_t2[4];
u32 w2_t2[4]; u32x w2_t2[4];
u32 w3_t2[4]; u32x w3_t2[4];
make_unicode (w0, w0_t2, w1_t2); make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2); make_unicode (w1, w2_t2, w3_t2);
@ -181,31 +150,31 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0_t2[0]); u32x w0_t = swap32 (w0_t2[0]);
u32 w1_t = swap32 (w0_t2[1]); u32x w1_t = swap32 (w0_t2[1]);
u32 w2_t = swap32 (w0_t2[2]); u32x w2_t = swap32 (w0_t2[2]);
u32 w3_t = swap32 (w0_t2[3]); u32x w3_t = swap32 (w0_t2[3]);
u32 w4_t = swap32 (w1_t2[0]); u32x w4_t = swap32 (w1_t2[0]);
u32 w5_t = swap32 (w1_t2[1]); u32x w5_t = swap32 (w1_t2[1]);
u32 w6_t = swap32 (w1_t2[2]); u32x w6_t = swap32 (w1_t2[2]);
u32 w7_t = swap32 (w1_t2[3]); u32x w7_t = swap32 (w1_t2[3]);
u32 w8_t = swap32 (w2_t2[0]); u32x w8_t = swap32 (w2_t2[0]);
u32 w9_t = swap32 (w2_t2[1]); u32x w9_t = swap32 (w2_t2[1]);
u32 wa_t = swap32 (w2_t2[2]); u32x wa_t = swap32 (w2_t2[2]);
u32 wb_t = swap32 (w2_t2[3]); u32x wb_t = swap32 (w2_t2[3]);
u32 wc_t = swap32 (w3_t2[0]); u32x wc_t = swap32 (w3_t2[0]);
u32 wd_t = swap32 (w3_t2[1]); u32x wd_t = swap32 (w3_t2[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -275,13 +244,7 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_M_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_M
} }
} }
@ -309,54 +272,31 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -380,43 +320,35 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -441,10 +373,10 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t2[4]; u32x w0_t2[4];
u32 w1_t2[4]; u32x w1_t2[4];
u32 w2_t2[4]; u32x w2_t2[4];
u32 w3_t2[4]; u32x w3_t2[4];
make_unicode (w0, w0_t2, w1_t2); make_unicode (w0, w0_t2, w1_t2);
make_unicode (w1, w2_t2, w3_t2); make_unicode (w1, w2_t2, w3_t2);
@ -466,31 +398,31 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha256 * sha256
*/ */
u32 w0_t = swap32 (w0_t2[0]); u32x w0_t = swap32 (w0_t2[0]);
u32 w1_t = swap32 (w0_t2[1]); u32x w1_t = swap32 (w0_t2[1]);
u32 w2_t = swap32 (w0_t2[2]); u32x w2_t = swap32 (w0_t2[2]);
u32 w3_t = swap32 (w0_t2[3]); u32x w3_t = swap32 (w0_t2[3]);
u32 w4_t = swap32 (w1_t2[0]); u32x w4_t = swap32 (w1_t2[0]);
u32 w5_t = swap32 (w1_t2[1]); u32x w5_t = swap32 (w1_t2[1]);
u32 w6_t = swap32 (w1_t2[2]); u32x w6_t = swap32 (w1_t2[2]);
u32 w7_t = swap32 (w1_t2[3]); u32x w7_t = swap32 (w1_t2[3]);
u32 w8_t = swap32 (w2_t2[0]); u32x w8_t = swap32 (w2_t2[0]);
u32 w9_t = swap32 (w2_t2[1]); u32x w9_t = swap32 (w2_t2[1]);
u32 wa_t = swap32 (w2_t2[2]); u32x wa_t = swap32 (w2_t2[2]);
u32 wb_t = swap32 (w2_t2[3]); u32x wb_t = swap32 (w2_t2[3]);
u32 wc_t = swap32 (w3_t2[0]); u32x wc_t = swap32 (w3_t2[0]);
u32 wd_t = swap32 (w3_t2[1]); u32x wd_t = swap32 (w3_t2[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_salt_len * 8; u32x wf_t = pw_salt_len * 8;
u32 a = SHA256M_A; u32x a = SHA256M_A;
u32 b = SHA256M_B; u32x b = SHA256M_B;
u32 c = SHA256M_C; u32x c = SHA256M_C;
u32 d = SHA256M_D; u32x d = SHA256M_D;
u32 e = SHA256M_E; u32x e = SHA256M_E;
u32 f = SHA256M_F; u32x f = SHA256M_F;
u32 g = SHA256M_G; u32x g = SHA256M_G;
u32 h = SHA256M_H; u32x h = SHA256M_H;
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00);
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01);
@ -560,13 +492,7 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_S_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 k_sha256[64] = __constant u32 k_sha256[64] =
{ {
@ -243,41 +243,20 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -302,67 +281,53 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -373,28 +338,28 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -427,12 +392,7 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]);
const u32 r1 = digest[7];
const u32 r2 = digest[2];
const u32 r3 = digest[6];
#include COMPARE_M
} }
} }
@ -460,41 +420,20 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -531,67 +470,53 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -602,28 +527,28 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -656,12 +581,7 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]);
const u32 r1 = digest[7];
const u32 r2 = digest[2];
const u32 r3 = digest[6];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 k_sha256[64] = __constant u32 k_sha256[64] =
{ {
@ -243,41 +243,20 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -337,67 +316,53 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -427,12 +392,7 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]);
const u32 r1 = digest[7];
const u32 r2 = digest[2];
const u32 r3 = digest[6];
#include COMPARE_M
} }
} }
@ -460,41 +420,20 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -566,67 +505,53 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -656,12 +581,7 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]);
const u32 r1 = digest[7];
const u32 r2 = digest[2];
const u32 r3 = digest[6];
#include COMPARE_S
} }
} }

View File

@ -7,6 +7,8 @@
#define _DES_ #define _DES_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define PERM_OP(a,b,tt,n,m) \ #define PERM_OP(a,b,tt,n,m) \
{ \ { \
@ -520,7 +520,7 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
@ -565,69 +565,49 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* main * main
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
pw_len = (pw_len >= 8) ? 8 : pw_len; pw_len = (pw_len >= 8) ? 8 : pw_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = 0;
wordr0[3] = 0;
u32 wordr1[4];
wordr1[0] = 0;
wordr1[1] = 0;
wordr1[2] = 0;
wordr1[3] = 0;
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = 0; w0[2] = 0;
w0[3] = 0; w0[3] = 0;
u32 w1[4]; u32x w1[4];
w1[0] = 0; w1[0] = 0;
w1[1] = 0; w1[1] = 0;
w1[2] = 0; w1[2] = 0;
w1[3] = 0; w1[3] = 0;
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
@ -648,12 +628,10 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
_des_crypt_encrypt (iv, mask, Kc, Kd, s_SPtrans); _des_crypt_encrypt (iv, mask, Kc, Kd, s_SPtrans);
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_M COMPARE_M_SIMD (iv[0], iv[1], c, d);
} }
} }
@ -711,7 +689,7 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
@ -768,69 +746,49 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* main * main
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
pw_len = (pw_len >= 8) ? 8 : pw_len; pw_len = (pw_len >= 8) ? 8 : pw_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = 0;
wordr0[3] = 0;
u32 wordr1[4];
wordr1[0] = 0;
wordr1[1] = 0;
wordr1[2] = 0;
wordr1[3] = 0;
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = 0; w0[2] = 0;
w0[3] = 0; w0[3] = 0;
u32 w1[4]; u32x w1[4];
w1[0] = 0; w1[0] = 0;
w1[1] = 0; w1[1] = 0;
w1[2] = 0; w1[2] = 0;
w1[3] = 0; w1[3] = 0;
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
@ -851,12 +809,10 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
_des_crypt_encrypt (iv, mask, Kc, Kd, s_SPtrans); _des_crypt_encrypt (iv, mask, Kc, Kd, s_SPtrans);
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_S COMPARE_S_SIMD (iv[0], iv[1], c, d);
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA512_ #define _SHA512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha512[80] = __constant u64 k_sha512[80] =
{ {
@ -157,92 +157,53 @@ __kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -265,10 +226,10 @@ __kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* SHA512 * SHA512
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
@ -334,43 +295,20 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -387,51 +325,35 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
append_0x80_2x4 (wordr0, wordr1, pw_r_len);
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -454,10 +376,10 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* SHA512 * SHA512
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);

View File

@ -5,6 +5,8 @@
#define _SHA512_ #define _SHA512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha512[80] = __constant u64 k_sha512[80] =
{ {
@ -157,54 +157,31 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -216,39 +193,25 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -287,14 +250,14 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
@ -319,10 +282,10 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha512 * sha512
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
@ -388,54 +351,31 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -459,39 +399,25 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -530,14 +456,14 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
@ -562,10 +488,10 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha512 * sha512
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);

View File

@ -5,6 +5,8 @@
#define _SHA512_ #define _SHA512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha512[80] = __constant u64 k_sha512[80] =
{ {
@ -157,54 +157,31 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -216,43 +193,35 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -275,7 +244,7 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
@ -294,10 +263,10 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha512 * sha512
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
@ -363,54 +332,31 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -434,43 +380,35 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -493,7 +431,7 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len);
@ -512,10 +450,10 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha512 * sha512
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);

View File

@ -5,6 +5,8 @@
#define _SHA512_ #define _SHA512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha512[80] = __constant u64 k_sha512[80] =
{ {
@ -157,54 +157,31 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -216,43 +193,35 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -307,10 +276,10 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -402,54 +371,31 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -473,43 +419,35 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -564,10 +502,10 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);

View File

@ -5,6 +5,8 @@
#define _SHA512_ #define _SHA512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha512[80] = __constant u64 k_sha512[80] =
{ {
@ -157,54 +157,31 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -216,43 +193,35 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -277,10 +246,10 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -366,54 +335,31 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -437,43 +383,35 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -498,10 +436,10 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);

View File

@ -5,6 +5,8 @@
#define _SHA512_ #define _SHA512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha512[80] = __constant u64 k_sha512[80] =
{ {
@ -274,41 +274,20 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -333,67 +312,53 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -404,28 +369,28 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -492,41 +457,20 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -563,67 +507,53 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -634,28 +564,28 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;

View File

@ -5,6 +5,8 @@
#define _SHA512_ #define _SHA512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha512[80] = __constant u64 k_sha512[80] =
{ {
@ -274,41 +274,20 @@ __kernel void m01760_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -368,67 +347,53 @@ __kernel void m01760_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -492,41 +457,20 @@ __kernel void m01760_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -598,67 +542,53 @@ __kernel void m01760_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,106 +36,71 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -147,10 +112,10 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[0] = 0x80; w1[0] = 0x80;
w3[2] = 16 * 8; w3[2] = 16 * 8;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -224,13 +189,7 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
d &= 0x00ffffff; d &= 0x00ffffff;
c &= 0x00ffffff; c &= 0x00ffffff;
b &= 0x00ffffff; b &= 0x00ffffff;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -258,41 +217,20 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -309,67 +247,53 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -381,10 +305,10 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[0] = 0x80; w1[0] = 0x80;
w3[2] = 16 * 8; w3[2] = 16 * 8;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -463,13 +387,7 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
d &= 0x00ffffff; d &= 0x00ffffff;
c &= 0x00ffffff; c &= 0x00ffffff;
b &= 0x00ffffff; b &= 0x00ffffff;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,41 +36,20 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -88,39 +67,25 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -159,32 +124,32 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[2] = wordl0[2] | wordr0[2] | s0[2];
w0[3] = wordl0[3] | wordr0[3] | s0[3]; w0[3] = wordl0[3] | wordr0[3] | s0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -196,10 +161,10 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[0] = 0x80; w1[0] = 0x80;
w3[2] = 16 * 8; w3[2] = 16 * 8;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -273,13 +238,7 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
d &= 0x00ffffff; d &= 0x00ffffff;
c &= 0x00ffffff; c &= 0x00ffffff;
b &= 0x00ffffff; b &= 0x00ffffff;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -307,41 +266,20 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -371,39 +309,25 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -442,32 +366,32 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
s3[2] = 0; s3[2] = 0;
s3[3] = 0; s3[3] = 0;
switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); switch_buffer_by_offset_le_S (s0, s1, s2, s3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0] | s0[0]; w0[0] = wordl0[0] | wordr0[0] | s0[0];
w0[1] = wordl0[1] | wordr0[1] | s0[1]; w0[1] = wordl0[1] | wordr0[1] | s0[1];
w0[2] = wordl0[2] | wordr0[2] | s0[2]; w0[2] = wordl0[2] | wordr0[2] | s0[2];
w0[3] = wordl0[3] | wordr0[3] | s0[3]; w0[3] = wordl0[3] | wordr0[3] | s0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -479,10 +403,10 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1[0] = 0x80; w1[0] = 0x80;
w3[2] = 16 * 8; w3[2] = 16 * 8;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -561,13 +485,7 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
d &= 0x00ffffff; d &= 0x00ffffff;
c &= 0x00ffffff; c &= 0x00ffffff;
b &= 0x00ffffff; b &= 0x00ffffff;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -36,43 +36,20 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -113,77 +90,63 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -357,12 +320,7 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
const u32 r0 = a; COMPARE_M_SIMD (a, d, c, b);
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -388,43 +346,20 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -477,77 +412,63 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -720,13 +641,7 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -36,43 +36,20 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -115,77 +92,63 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -441,13 +404,7 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -473,43 +430,20 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -564,77 +498,63 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -893,13 +813,7 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -36,43 +36,20 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -114,77 +91,63 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -440,13 +403,7 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -472,43 +429,20 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -562,77 +496,63 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -891,13 +811,7 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -7,6 +7,8 @@
#define _DES_ #define _DES_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define PERM_OP(a,b,tt,n,m) \ #define PERM_OP(a,b,tt,n,m) \
{ \ { \
@ -517,7 +517,7 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
@ -556,69 +556,55 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* main * main
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
pw_len = (pw_len >= 7) ? 7 : pw_len; pw_len = (pw_len >= 7) ? 7 : pw_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = 0; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = 0; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = 0; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = 0;
wordr1[2] = 0;
wordr1[3] = 0;
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = 0; w0[2] = 0;
w0[3] = 0; w0[3] = 0;
u32 w1[4]; u32x w1[4];
w1[0] = 0; w1[0] = 0;
w1[1] = 0; w1[1] = 0;
w1[2] = 0; w1[2] = 0;
w1[3] = 0; w1[3] = 0;
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
@ -646,12 +632,10 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
_des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans);
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_M COMPARE_M_SIMD (iv[0], iv[1], c, d);
} }
} }
@ -709,7 +693,7 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
@ -760,69 +744,49 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* main * main
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
pw_len = (pw_len >= 7) ? 7 : pw_len; pw_len = (pw_len >= 7) ? 7 : pw_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = 0;
wordr0[3] = 0;
u32 wordr1[4];
wordr1[0] = 0;
wordr1[1] = 0;
wordr1[2] = 0;
wordr1[3] = 0;
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = 0; w0[2] = 0;
w0[3] = 0; w0[3] = 0;
u32 w1[4]; u32x w1[4];
w1[0] = 0; w1[0] = 0;
w1[1] = 0; w1[1] = 0;
w1[2] = 0; w1[2] = 0;
w1[3] = 0; w1[3] = 0;
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
@ -850,12 +814,10 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
_des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans);
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_S COMPARE_S_SIMD (iv[0], iv[1], c, d);
} }
} }

View File

@ -7,6 +7,8 @@
#define _DES_ #define _DES_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define PERM_OP(a,b,tt,n,m) \ #define PERM_OP(a,b,tt,n,m) \
{ \ { \
@ -558,7 +558,7 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
@ -566,14 +566,12 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -585,69 +583,55 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
const u32 salt_word_len = (salt_len + pw_len) * 2; const u32 salt_word_len = (salt_len + pw_len) * 2;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -658,10 +642,10 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
@ -801,12 +785,10 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* cmp * cmp
*/ */
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_M COMPARE_M_SIMD (iv[0], iv[1], c, d);
} }
} }
@ -864,54 +846,31 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
u32 salt_buf0[4]; u32 salt_buf0[4];
u32 salt_buf1[4];
salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0]; salt_buf0[0] = salt_bufs[salt_pos].salt_buf[0];
salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1]; salt_buf0[1] = salt_bufs[salt_pos].salt_buf[1];
salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2]; salt_buf0[2] = salt_bufs[salt_pos].salt_buf[2];
salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3]; salt_buf0[3] = salt_bufs[salt_pos].salt_buf[3];
u32 salt_buf1[4];
salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4]; salt_buf1[0] = salt_bufs[salt_pos].salt_buf[4];
salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5]; salt_buf1[1] = salt_bufs[salt_pos].salt_buf[5];
salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6]; salt_buf1[2] = salt_bufs[salt_pos].salt_buf[6];
@ -935,69 +894,55 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
const u32 salt_word_len = (salt_len + pw_len) * 2; const u32 salt_word_len = (salt_len + pw_len) * 2;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -1008,10 +953,10 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
@ -1151,12 +1096,10 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* cmp * cmp
*/ */
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_S COMPARE_S_SIMD (iv[0], iv[1], c, d);
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -55,43 +55,20 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -132,77 +109,63 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -277,10 +240,10 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
c += MD5M_C; c += MD5M_C;
d += MD5M_D; d += MD5M_D;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
| uint_to_hex_lower8 ((a >> 8) & 255) << 16; | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
@ -410,13 +373,7 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -461,43 +418,20 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -550,77 +484,63 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -695,10 +615,10 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
c += MD5M_C; c += MD5M_C;
d += MD5M_D; d += MD5M_D;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0
| uint_to_hex_lower8 ((a >> 8) & 255) << 16; | uint_to_hex_lower8 ((a >> 8) & 255) << 16;
@ -828,13 +748,7 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -34,41 +34,20 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -107,67 +86,53 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -178,28 +143,28 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
w0_t[2] = w0[2]; w0_t[2] = w0[2];
w0_t[3] = w0[3]; w0_t[3] = w0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = w1[0]; w1_t[0] = w1[0];
w1_t[1] = w1[1]; w1_t[1] = w1[1];
w1_t[2] = w1[2]; w1_t[2] = w1[2];
w1_t[3] = w1[3]; w1_t[3] = w1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = w2[0]; w2_t[0] = w2[0];
w2_t[1] = w2[1]; w2_t[1] = w2[1];
w2_t[2] = w2[2]; w2_t[2] = w2[2];
w2_t[3] = w2[3]; w2_t[3] = w2[3];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = w3[0]; w3_t[0] = w3[0];
w3_t[1] = w3[1]; w3_t[1] = w3[1];
@ -286,10 +251,10 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 * md5
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -359,13 +324,7 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -391,41 +350,20 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -476,67 +414,53 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -547,28 +471,28 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
w0_t[2] = w0[2]; w0_t[2] = w0[2];
w0_t[3] = w0[3]; w0_t[3] = w0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = w1[0]; w1_t[0] = w1[0];
w1_t[1] = w1[1]; w1_t[1] = w1[1];
w1_t[2] = w1[2]; w1_t[2] = w1[2];
w1_t[3] = w1[3]; w1_t[3] = w1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = w2[0]; w2_t[0] = w2[0];
w2_t[1] = w2[1]; w2_t[1] = w2[1];
w2_t[2] = w2[2]; w2_t[2] = w2[2];
w2_t[3] = w2[3]; w2_t[3] = w2[3];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = w3[0]; w3_t[0] = w3[0];
w3_t[1] = w3[1]; w3_t[1] = w3[1];
@ -655,10 +579,10 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 * md5
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -727,13 +651,7 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -55,43 +55,20 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -113,77 +90,63 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -356,13 +319,7 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -407,43 +364,20 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -477,77 +411,63 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -720,13 +640,7 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_SHA1_ #define _MD5_SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -55,80 +55,43 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -137,28 +100,28 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -169,28 +132,28 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -398,13 +361,7 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -449,43 +406,20 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -502,39 +436,25 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -543,28 +463,28 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -575,28 +495,28 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -804,13 +724,7 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8_le(i) l_bin2asc[(i)] #define uint_to_hex_lower8_le(i) l_bin2asc[(i)]
@ -55,80 +55,43 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -137,28 +100,28 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -169,28 +132,28 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -427,13 +390,7 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -478,43 +435,20 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -531,45 +465,31 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -578,28 +498,28 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -610,28 +530,28 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -865,19 +785,13 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_MD5_ #define _SHA1_MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -17,9 +19,7 @@
#undef _MD5_ #undef _MD5_
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8_le(i) l_bin2asc[(i)] #define uint_to_hex_lower8_le(i) l_bin2asc[(i)]
@ -56,80 +56,43 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -138,28 +101,28 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -170,10 +133,10 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 * md5
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -252,31 +215,31 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 u32x w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0
| uint_to_hex_lower8_le ((a >> 0) & 255) << 16; | uint_to_hex_lower8_le ((a >> 0) & 255) << 16;
u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 u32x w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0
| uint_to_hex_lower8_le ((a >> 16) & 255) << 16; | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 u32x w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0
| uint_to_hex_lower8_le ((b >> 0) & 255) << 16; | uint_to_hex_lower8_le ((b >> 0) & 255) << 16;
u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 u32x w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0
| uint_to_hex_lower8_le ((b >> 16) & 255) << 16; | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 u32x w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0
| uint_to_hex_lower8_le ((c >> 0) & 255) << 16; | uint_to_hex_lower8_le ((c >> 0) & 255) << 16;
u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 u32x w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0
| uint_to_hex_lower8_le ((c >> 16) & 255) << 16; | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 u32x w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0
| uint_to_hex_lower8_le ((d >> 0) & 255) << 16; | uint_to_hex_lower8_le ((d >> 0) & 255) << 16;
u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 u32x w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0
| uint_to_hex_lower8_le ((d >> 16) & 255) << 16; | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;
u32 w8_t = 0x80000000; u32x w8_t = 0x80000000;
u32 w9_t = 0; u32x w9_t = 0;
u32 wa_t = 0; u32x wa_t = 0;
u32 wb_t = 0; u32x wb_t = 0;
u32 wc_t = 0; u32x wc_t = 0;
u32 wd_t = 0; u32x wd_t = 0;
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = 32 * 8; u32x wf_t = 32 * 8;
u32 e; u32 e;
@ -381,13 +344,7 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -432,43 +389,20 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -485,45 +419,31 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -532,28 +452,28 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -564,10 +484,10 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 * md5
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -646,31 +566,31 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 u32x w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0
| uint_to_hex_lower8_le ((a >> 0) & 255) << 16; | uint_to_hex_lower8_le ((a >> 0) & 255) << 16;
u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 u32x w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0
| uint_to_hex_lower8_le ((a >> 16) & 255) << 16; | uint_to_hex_lower8_le ((a >> 16) & 255) << 16;
u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 u32x w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0
| uint_to_hex_lower8_le ((b >> 0) & 255) << 16; | uint_to_hex_lower8_le ((b >> 0) & 255) << 16;
u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 u32x w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0
| uint_to_hex_lower8_le ((b >> 16) & 255) << 16; | uint_to_hex_lower8_le ((b >> 16) & 255) << 16;
u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 u32x w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0
| uint_to_hex_lower8_le ((c >> 0) & 255) << 16; | uint_to_hex_lower8_le ((c >> 0) & 255) << 16;
u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 u32x w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0
| uint_to_hex_lower8_le ((c >> 16) & 255) << 16; | uint_to_hex_lower8_le ((c >> 16) & 255) << 16;
u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 u32x w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0
| uint_to_hex_lower8_le ((d >> 0) & 255) << 16; | uint_to_hex_lower8_le ((d >> 0) & 255) << 16;
u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 u32x w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0
| uint_to_hex_lower8_le ((d >> 16) & 255) << 16; | uint_to_hex_lower8_le ((d >> 16) & 255) << 16;
u32 w8_t = 0x80000000; u32x w8_t = 0x80000000;
u32 w9_t = 0; u32x w9_t = 0;
u32 wa_t = 0; u32x wa_t = 0;
u32 wb_t = 0; u32x wb_t = 0;
u32 wc_t = 0; u32x wc_t = 0;
u32 wd_t = 0; u32x wd_t = 0;
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = 32 * 8; u32x wf_t = 32 * 8;
u32 e; u32 e;
@ -772,19 +692,13 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,41 +36,20 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -89,67 +68,53 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -198,7 +163,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len); switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
w0[0] |= s0[0]; w0[0] |= s0[0];
w0[1] |= s0[1]; w0[1] |= s0[1];
@ -221,10 +186,10 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 * md5
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -293,13 +258,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -327,41 +286,20 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -392,67 +330,53 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -501,7 +425,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len); switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
w0[0] |= s0[0]; w0[0] |= s0[0];
w0[1] |= s0[1]; w0[1] |= s0[1];
@ -524,10 +448,10 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 * md5
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -594,20 +518,12 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30);
bool q_cond = allx (search[0] != a); if (MATCHES_NONE_VS (a, search[0])) continue;
if (q_cond) continue;
MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,41 +36,20 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -109,67 +88,53 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = wordl0[0] | wordr0[0]; w0_t[0] = wordl0[0] | wordr0[0];
w0_t[1] = wordl0[1] | wordr0[1]; w0_t[1] = wordl0[1] | wordr0[1];
w0_t[2] = wordl0[2] | wordr0[2]; w0_t[2] = wordl0[2] | wordr0[2];
w0_t[3] = wordl0[3] | wordr0[3]; w0_t[3] = wordl0[3] | wordr0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = wordl1[0] | wordr1[0]; w1_t[0] = wordl1[0] | wordr1[0];
w1_t[1] = wordl1[1] | wordr1[1]; w1_t[1] = wordl1[1] | wordr1[1];
w1_t[2] = wordl1[2] | wordr1[2]; w1_t[2] = wordl1[2] | wordr1[2];
w1_t[3] = wordl1[3] | wordr1[3]; w1_t[3] = wordl1[3] | wordr1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = wordl2[0] | wordr2[0]; w2_t[0] = wordl2[0] | wordr2[0];
w2_t[1] = wordl2[1] | wordr2[1]; w2_t[1] = wordl2[1] | wordr2[1];
w2_t[2] = wordl2[2] | wordr2[2]; w2_t[2] = wordl2[2] | wordr2[2];
w2_t[3] = wordl2[3] | wordr2[3]; w2_t[3] = wordl2[3] | wordr2[3];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = wordl3[0] | wordr3[0]; w3_t[0] = wordl3[0] | wordr3[0];
w3_t[1] = wordl3[1] | wordr3[1]; w3_t[1] = wordl3[1] | wordr3[1];
@ -254,32 +219,32 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len);
u32 w0 = swap32 (w0_t[0]); u32x w0 = swap32 (w0_t[0]);
u32 w1 = swap32 (w0_t[1]); u32x w1 = swap32 (w0_t[1]);
u32 w2 = swap32 (w0_t[2]); u32x w2 = swap32 (w0_t[2]);
u32 w3 = swap32 (w0_t[3]); u32x w3 = swap32 (w0_t[3]);
u32 w4 = swap32 (w1_t[0]); u32x w4 = swap32 (w1_t[0]);
u32 w5 = swap32 (w1_t[1]); u32x w5 = swap32 (w1_t[1]);
u32 w6 = swap32 (w1_t[2]); u32x w6 = swap32 (w1_t[2]);
u32 w7 = swap32 (w1_t[3]); u32x w7 = swap32 (w1_t[3]);
u32 w8 = swap32 (w2_t[0]); u32x w8 = swap32 (w2_t[0]);
u32 w9 = swap32 (w2_t[1]); u32x w9 = swap32 (w2_t[1]);
u32 wa = swap32 (w2_t[2]); u32x wa = swap32 (w2_t[2]);
u32 wb = swap32 (w2_t[3]); u32x wb = swap32 (w2_t[3]);
u32 wc = swap32 (w3_t[0]); u32x wc = swap32 (w3_t[0]);
u32 wd = swap32 (w3_t[1]); u32x wd = swap32 (w3_t[1]);
u32 we = 0; u32x we = 0;
u32 wf = pw_salt_len * 8; u32x wf = pw_salt_len * 8;
/** /**
* sha1 * sha1
*/ */
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -376,13 +341,7 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd); wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd);
we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we); we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we);
wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf); wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -410,41 +369,20 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -495,73 +433,59 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = wordl0[0] | wordr0[0]; w0_t[0] = wordl0[0] | wordr0[0];
w0_t[1] = wordl0[1] | wordr0[1]; w0_t[1] = wordl0[1] | wordr0[1];
w0_t[2] = wordl0[2] | wordr0[2]; w0_t[2] = wordl0[2] | wordr0[2];
w0_t[3] = wordl0[3] | wordr0[3]; w0_t[3] = wordl0[3] | wordr0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = wordl1[0] | wordr1[0]; w1_t[0] = wordl1[0] | wordr1[0];
w1_t[1] = wordl1[1] | wordr1[1]; w1_t[1] = wordl1[1] | wordr1[1];
w1_t[2] = wordl1[2] | wordr1[2]; w1_t[2] = wordl1[2] | wordr1[2];
w1_t[3] = wordl1[3] | wordr1[3]; w1_t[3] = wordl1[3] | wordr1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = wordl2[0] | wordr2[0]; w2_t[0] = wordl2[0] | wordr2[0];
w2_t[1] = wordl2[1] | wordr2[1]; w2_t[1] = wordl2[1] | wordr2[1];
w2_t[2] = wordl2[2] | wordr2[2]; w2_t[2] = wordl2[2] | wordr2[2];
w2_t[3] = wordl2[3] | wordr2[3]; w2_t[3] = wordl2[3] | wordr2[3];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = wordl3[0] | wordr3[0]; w3_t[0] = wordl3[0] | wordr3[0];
w3_t[1] = wordl3[1] | wordr3[1]; w3_t[1] = wordl3[1] | wordr3[1];
@ -646,32 +570,32 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len); append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_salt_len);
u32 w0 = swap32 (w0_t[0]); u32x w0 = swap32 (w0_t[0]);
u32 w1 = swap32 (w0_t[1]); u32x w1 = swap32 (w0_t[1]);
u32 w2 = swap32 (w0_t[2]); u32x w2 = swap32 (w0_t[2]);
u32 w3 = swap32 (w0_t[3]); u32x w3 = swap32 (w0_t[3]);
u32 w4 = swap32 (w1_t[0]); u32x w4 = swap32 (w1_t[0]);
u32 w5 = swap32 (w1_t[1]); u32x w5 = swap32 (w1_t[1]);
u32 w6 = swap32 (w1_t[2]); u32x w6 = swap32 (w1_t[2]);
u32 w7 = swap32 (w1_t[3]); u32x w7 = swap32 (w1_t[3]);
u32 w8 = swap32 (w2_t[0]); u32x w8 = swap32 (w2_t[0]);
u32 w9 = swap32 (w2_t[1]); u32x w9 = swap32 (w2_t[1]);
u32 wa = swap32 (w2_t[2]); u32x wa = swap32 (w2_t[2]);
u32 wb = swap32 (w2_t[3]); u32x wb = swap32 (w2_t[3]);
u32 wc = swap32 (w3_t[0]); u32x wc = swap32 (w3_t[0]);
u32 wd = swap32 (w3_t[1]); u32x wd = swap32 (w3_t[1]);
u32 we = 0; u32x we = 0;
u32 wf = pw_salt_len * 8; u32x wf = pw_salt_len * 8;
/** /**
* sha1 * sha1
*/ */
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -765,19 +689,13 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wa = rotl32 ((w7 ^ w2 ^ wc ^ wa), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa); wa = rotl32 ((w7 ^ w2 ^ wc ^ wa), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa);
wb = rotl32 ((w8 ^ w3 ^ wd ^ wb), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb); wb = rotl32 ((w8 ^ w3 ^ wd ^ wb), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
wc = rotl32 ((w9 ^ w4 ^ we ^ wc), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc); wc = rotl32 ((w9 ^ w4 ^ we ^ wc), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc);
wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd); wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd);
we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we); we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we);
wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf); wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _KECCAK_ #define _KECCAK_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 keccakf_rndc[24] = __constant u64 keccakf_rndc[24] =
{ {
@ -102,43 +102,20 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x01_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* 0x80 keccak, very special * 0x80 keccak, very special
*/ */
@ -153,39 +130,25 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -194,28 +157,28 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -363,43 +326,20 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x01_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -426,39 +366,25 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -467,28 +393,28 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];

View File

@ -5,6 +5,8 @@
#define _MD5H_ #define _MD5H_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m05100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m05100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,118 +36,81 @@ __kernel void m05100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -319,86 +282,72 @@ __kernel void m05100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
append_0x80_2x4 (wordl0, wordl1, pw_l_len); append_0x80_2x4_S (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{ {
@ -248,106 +248,71 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -358,28 +323,28 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
w0_t[2] = w0[2]; w0_t[2] = w0[2];
w0_t[3] = w0[3]; w0_t[3] = w0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = w1[0]; w1_t[0] = w1[0];
w1_t[1] = w1[1]; w1_t[1] = w1[1];
w1_t[2] = w1[2]; w1_t[2] = w1[2];
w1_t[3] = w1[3]; w1_t[3] = w1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = w2[0]; w2_t[0] = w2[0];
w2_t[1] = w2[1]; w2_t[1] = w2[1];
w2_t[2] = w2[2]; w2_t[2] = w2[2];
w2_t[3] = w2[3]; w2_t[3] = w2[3];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = w3[0]; w3_t[0] = w3[0];
w3_t[1] = w3[1]; w3_t[1] = w3[1];
@ -475,12 +440,7 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[0]; COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]);
const u32 r1 = digest[3];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -537,41 +497,20 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -588,67 +527,53 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -659,28 +584,28 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
w0_t[2] = w0[2]; w0_t[2] = w0[2];
w0_t[3] = w0[3]; w0_t[3] = w0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = w1[0]; w1_t[0] = w1[0];
w1_t[1] = w1[1]; w1_t[1] = w1[1];
w1_t[2] = w1[2]; w1_t[2] = w1[2];
w1_t[3] = w1[3]; w1_t[3] = w1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -776,12 +701,7 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[0]; COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]);
const u32 r1 = digest[3];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{ {
@ -282,106 +282,71 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -392,28 +357,28 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -509,12 +474,7 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -571,41 +531,20 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -622,67 +561,53 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -693,28 +618,28 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -810,12 +735,7 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -7,6 +7,8 @@
#define _MD4_ #define _MD4_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define PERM_OP(a,b,tt,n,m) \ #define PERM_OP(a,b,tt,n,m) \
{ \ { \
@ -528,43 +528,20 @@ __kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -582,87 +559,73 @@ __kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
w3_t[2] = pw_len * 8 * 2; w3_t[2] = pw_len * 8 * 2;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);
@ -822,43 +785,20 @@ __kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -888,87 +828,73 @@ __kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
w3_t[2] = pw_len * 8 * 2; w3_t[2] = pw_len * 8 * 2;
u32 a = MD4M_A; u32x a = MD4M_A;
u32 b = MD4M_B; u32x b = MD4M_B;
u32 c = MD4M_C; u32x c = MD4M_C;
u32 d = MD4M_D; u32x d = MD4M_D;
MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00); MD4_STEP (MD4_Fo, a, b, c, d, w0_t[0], MD4C00, MD4S00);
MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01); MD4_STEP (MD4_Fo, d, a, b, c, w0_t[1], MD4C00, MD4S01);

View File

@ -5,6 +5,8 @@
#define _NETNTLMV2_ #define _NETNTLMV2_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void md4_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) static void md4_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{ {
@ -326,118 +326,81 @@ __kernel void m05600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -589,12 +552,7 @@ __kernel void m05600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[0]; COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]);
const u32 r1 = digest[3];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -648,43 +606,20 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -701,77 +636,63 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -923,12 +844,7 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[0]; COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]);
const u32 r1 = digest[3];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _RIPEMD160_ #define _RIPEMD160_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void ripemd160_transform (const u32 w[16], u32 dgst[5]) static void ripemd160_transform (const u32 w[16], u32 dgst[5])
{ {
@ -233,115 +233,78 @@ __kernel void m06000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 wl[16]; u32x wl[16];
wl[ 0] = w0[0]; wl[ 0] = w0[0];
wl[ 1] = w0[1]; wl[ 1] = w0[1];
@ -403,43 +366,20 @@ __kernel void m06000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -456,74 +396,60 @@ __kernel void m06000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = pw_len * 8; w3[2] = pw_len * 8;
w3[3] = 0; w3[3] = 0;
u32 wl[16]; u32x wl[16];
wl[ 0] = w0[0]; wl[ 0] = w0[0];
wl[ 1] = w0[1]; wl[ 1] = w0[1];

View File

@ -7,6 +7,8 @@
#define _WHIRLPOOL_ #define _WHIRLPOOL_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define R 10 #define R 10
@ -1381,115 +1381,78 @@ __kernel void m06100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 wl[16]; u32x wl[16];
wl[ 0] = swap32 (w0[0]); wl[ 0] = swap32 (w0[0]);
wl[ 1] = swap32 (w0[1]); wl[ 1] = swap32 (w0[1]);
@ -1592,43 +1555,20 @@ __kernel void m06100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -1645,74 +1585,60 @@ __kernel void m06100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = 0; w3[2] = 0;
w3[3] = 0; w3[3] = 0;
u32 wl[16]; u32x wl[16];
wl[ 0] = swap32 (w0[0]); wl[ 0] = swap32 (w0[0]);
wl[ 1] = swap32 (w0[1]); wl[ 1] = swap32 (w0[1]);

View File

@ -7,6 +7,8 @@
#define _GOST_ #define _GOST_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 c_tables[4][256] = __constant u32 c_tables[4][256] =
{ {
@ -727,106 +727,71 @@ __kernel void m06900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -1029,41 +994,20 @@ __kernel void m06900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -1080,67 +1024,53 @@ __kernel void m06900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{ {
@ -253,41 +253,20 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -298,67 +277,53 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -369,28 +334,28 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -450,12 +415,7 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -483,41 +443,20 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -540,67 +479,53 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -611,28 +536,28 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* pads * pads
*/ */
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -692,12 +617,7 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest);
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _KRB5PA_ #define _KRB5PA_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -559,41 +561,20 @@ __kernel void m07500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -620,67 +601,53 @@ __kernel void m07500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -736,41 +703,20 @@ __kernel void m07500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -797,67 +743,53 @@ __kernel void m07500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -55,43 +55,20 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -132,39 +109,25 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -173,28 +136,28 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -205,28 +168,28 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -334,7 +297,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* Prepend salt * Prepend salt
*/ */
u32 w0t[4]; u32x w0t[4];
w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0
| uint_to_hex_lower8 ((a >> 16) & 255) << 16; | uint_to_hex_lower8 ((a >> 16) & 255) << 16;
@ -345,7 +308,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0
| uint_to_hex_lower8 ((b >> 0) & 255) << 16; | uint_to_hex_lower8 ((b >> 0) & 255) << 16;
u32 w1t[4]; u32x w1t[4];
w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0
| uint_to_hex_lower8 ((c >> 16) & 255) << 16; | uint_to_hex_lower8 ((c >> 16) & 255) << 16;
@ -356,7 +319,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0
| uint_to_hex_lower8 ((d >> 0) & 255) << 16; | uint_to_hex_lower8 ((d >> 0) & 255) << 16;
u32 w2t[2]; u32x w2t[2];
w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0
| uint_to_hex_lower8 ((e >> 16) & 255) << 16; | uint_to_hex_lower8 ((e >> 16) & 255) << 16;
@ -637,13 +600,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
c += r_c; c += r_c;
d += r_d; d += r_d;
e += r_e; e += r_e;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -688,43 +645,20 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -777,45 +711,31 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -824,28 +744,28 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -856,28 +776,28 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -985,7 +905,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* Prepend salt * Prepend salt
*/ */
u32 w0t[4]; u32x w0t[4];
w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0
| uint_to_hex_lower8 ((a >> 16) & 255) << 16; | uint_to_hex_lower8 ((a >> 16) & 255) << 16;
@ -996,7 +916,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0
| uint_to_hex_lower8 ((b >> 0) & 255) << 16; | uint_to_hex_lower8 ((b >> 0) & 255) << 16;
u32 w1t[4]; u32x w1t[4];
w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0
| uint_to_hex_lower8 ((c >> 16) & 255) << 16; | uint_to_hex_lower8 ((c >> 16) & 255) << 16;
@ -1007,7 +927,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0
| uint_to_hex_lower8 ((d >> 0) & 255) << 16; | uint_to_hex_lower8 ((d >> 0) & 255) << 16;
u32 w2t[2]; u32x w2t[2];
w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0
| uint_to_hex_lower8 ((e >> 16) & 255) << 16; | uint_to_hex_lower8 ((e >> 16) & 255) << 16;
@ -1288,13 +1208,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
c += r_c; c += r_c;
d += r_d; d += r_d;
e += r_e; e += r_e;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -7,6 +7,8 @@
#define _SHA256_ #define _SHA256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 k_sha256[64] = __constant u32 k_sha256[64] =
{ {
@ -289,78 +289,49 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -379,16 +350,16 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
u32 w_t[16]; u32x w_t[16];
w_t[ 0] = swap32 (w0_t[0]); w_t[ 0] = swap32 (w0_t[0]);
w_t[ 1] = swap32 (w0_t[1]); w_t[ 1] = swap32 (w0_t[1]);
@ -445,12 +416,7 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha256_transform_s (digest, w_s1); // 448 - 512 sha256_transform_s (digest, w_s1); // 448 - 512
sha256_transform_s (digest, w_s2); // 512 - 576 sha256_transform_s (digest, w_s2); // 512 - 576
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]);
const u32 r1 = digest[7];
const u32 r2 = digest[2];
const u32 r3 = digest[6];
#include COMPARE_M
} }
} }
@ -525,41 +491,20 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -576,39 +521,31 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -627,16 +564,16 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
u32 w_t[16]; u32x w_t[16];
w_t[ 0] = swap32 (w0_t[0]); w_t[ 0] = swap32 (w0_t[0]);
w_t[ 1] = swap32 (w0_t[1]); w_t[ 1] = swap32 (w0_t[1]);
@ -693,12 +630,7 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha256_transform_s (digest, w_s1); // 448 - 512 sha256_transform_s (digest, w_s1); // 448 - 512
sha256_transform_s (digest, w_s2); // 512 - 576 sha256_transform_s (digest, w_s2); // 512 - 576
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]);
const u32 r1 = digest[7];
const u32 r2 = digest[2];
const u32 r3 = digest[6];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,41 +36,20 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -86,43 +65,35 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -145,12 +116,12 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
@ -192,11 +163,11 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -293,13 +264,7 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -327,41 +292,20 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -389,49 +333,41 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* reverse * reverse
*/ */
const u32 e_rev = rotl32 (search[1], 2u); const u32 e_rev = rotl32_S (search[1], 2u);
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -454,12 +390,12 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* prepend salt * prepend salt
*/ */
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
@ -501,11 +437,11 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
//w3_t[2] = swap32 (w3_t[2]); //w3_t[2] = swap32 (w3_t[2]);
//w3_t[3] = swap32 (w3_t[3]); //w3_t[3] = swap32 (w3_t[3]);
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -600,18 +536,12 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]);
w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]);
if (allx (e != e_rev)) continue; if (MATCHES_NONE_VS (e, e_rev)) continue;
w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]);
w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]);
w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{ {
@ -164,41 +164,20 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -241,43 +220,35 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -296,28 +267,28 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
w0_t[2] = w0[2]; w0_t[2] = w0[2];
w0_t[3] = w0[3]; w0_t[3] = w0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = w1[0]; w1_t[0] = w1[0];
w1_t[1] = w1[1]; w1_t[1] = w1[1];
w1_t[2] = w1[2]; w1_t[2] = w1[2];
w1_t[3] = w1[3]; w1_t[3] = w1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = w2[0]; w2_t[0] = w2[0];
w2_t[1] = w2[1]; w2_t[1] = w2[1];
w2_t[2] = w2[2]; w2_t[2] = w2[2];
w2_t[3] = w2[3]; w2_t[3] = w2[3];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = w3[0]; w3_t[0] = w3[0];
w3_t[1] = w3[1]; w3_t[1] = w3[1];
@ -396,28 +367,28 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t2[4]; u32x w0_t2[4];
w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]);
w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]);
w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]);
w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]);
u32 w1_t2[4]; u32x w1_t2[4];
w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]);
w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]);
w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]);
w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]);
u32 w2_t2[4]; u32x w2_t2[4];
w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]);
w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]);
w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]);
w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]);
u32 w3_t2[4]; u32x w3_t2[4];
w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]);
w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]);
@ -438,28 +409,28 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
for (u32 i = 0; i < salt_iter; i++) for (u32 i = 0; i < salt_iter; i++)
{ {
u32 w0_t3[4]; u32x w0_t3[4];
w0_t3[0] = digest[0]; w0_t3[0] = digest[0];
w0_t3[1] = digest[1]; w0_t3[1] = digest[1];
w0_t3[2] = digest[2]; w0_t3[2] = digest[2];
w0_t3[3] = digest[3]; w0_t3[3] = digest[3];
u32 w1_t3[4]; u32x w1_t3[4];
w1_t3[0] = digest[4]; w1_t3[0] = digest[4];
w1_t3[1] = swap32 (salt_buf0[0]); w1_t3[1] = swap32 (salt_buf0[0]);
w1_t3[2] = swap32 (salt_buf0[1]); w1_t3[2] = swap32 (salt_buf0[1]);
w1_t3[3] = swap32 (salt_buf0[2]); w1_t3[3] = swap32 (salt_buf0[2]);
u32 w2_t3[4]; u32x w2_t3[4];
w2_t3[0] = swap32 (salt_buf0[3]); w2_t3[0] = swap32 (salt_buf0[3]);
w2_t3[1] = swap32 (salt_buf1[0]); w2_t3[1] = swap32 (salt_buf1[0]);
w2_t3[2] = swap32 (salt_buf1[1]); w2_t3[2] = swap32 (salt_buf1[1]);
w2_t3[3] = swap32 (salt_buf1[2]); w2_t3[3] = swap32 (salt_buf1[2]);
u32 w3_t3[4]; u32x w3_t3[4];
w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[0] = swap32 (salt_buf1[3]);
w3_t3[1] = 0; w3_t3[1] = 0;
@ -475,12 +446,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest); sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest);
} }
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -508,41 +474,20 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -597,43 +542,35 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -652,28 +589,28 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = w0[0]; w0_t[0] = w0[0];
w0_t[1] = w0[1]; w0_t[1] = w0[1];
w0_t[2] = w0[2]; w0_t[2] = w0[2];
w0_t[3] = w0[3]; w0_t[3] = w0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = w1[0]; w1_t[0] = w1[0];
w1_t[1] = w1[1]; w1_t[1] = w1[1];
w1_t[2] = w1[2]; w1_t[2] = w1[2];
w1_t[3] = w1[3]; w1_t[3] = w1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = w2[0]; w2_t[0] = w2[0];
w2_t[1] = w2[1]; w2_t[1] = w2[1];
w2_t[2] = w2[2]; w2_t[2] = w2[2];
w2_t[3] = w2[3]; w2_t[3] = w2[3];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = w3[0]; w3_t[0] = w3[0];
w3_t[1] = w3[1]; w3_t[1] = w3[1];
@ -752,28 +689,28 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t2[4]; u32x w0_t2[4];
w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]);
w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]);
w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]);
w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]);
u32 w1_t2[4]; u32x w1_t2[4];
w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]);
w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]);
w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]);
w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]);
u32 w2_t2[4]; u32x w2_t2[4];
w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]);
w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]);
w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]);
w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]);
u32 w3_t2[4]; u32x w3_t2[4];
w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]);
w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]);
@ -794,28 +731,28 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
for (u32 i = 0; i < salt_iter; i++) for (u32 i = 0; i < salt_iter; i++)
{ {
u32 w0_t3[4]; u32x w0_t3[4];
w0_t3[0] = digest[0]; w0_t3[0] = digest[0];
w0_t3[1] = digest[1]; w0_t3[1] = digest[1];
w0_t3[2] = digest[2]; w0_t3[2] = digest[2];
w0_t3[3] = digest[3]; w0_t3[3] = digest[3];
u32 w1_t3[4]; u32x w1_t3[4];
w1_t3[0] = digest[4]; w1_t3[0] = digest[4];
w1_t3[1] = swap32 (salt_buf0[0]); w1_t3[1] = swap32 (salt_buf0[0]);
w1_t3[2] = swap32 (salt_buf0[1]); w1_t3[2] = swap32 (salt_buf0[1]);
w1_t3[3] = swap32 (salt_buf0[2]); w1_t3[3] = swap32 (salt_buf0[2]);
u32 w2_t3[4]; u32x w2_t3[4];
w2_t3[0] = swap32 (salt_buf0[3]); w2_t3[0] = swap32 (salt_buf0[3]);
w2_t3[1] = swap32 (salt_buf1[0]); w2_t3[1] = swap32 (salt_buf1[0]);
w2_t3[2] = swap32 (salt_buf1[1]); w2_t3[2] = swap32 (salt_buf1[1]);
w2_t3[3] = swap32 (salt_buf1[2]); w2_t3[3] = swap32 (salt_buf1[2]);
u32 w3_t3[4]; u32x w3_t3[4];
w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[0] = swap32 (salt_buf1[3]);
w3_t3[1] = 0; w3_t3[1] = 0;
@ -831,12 +768,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest); sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest);
} }
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8_le(i) l_bin2asc[(i)] #define uint_to_hex_lower8_le(i) l_bin2asc[(i)]
@ -183,43 +183,20 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -251,39 +228,25 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -292,56 +255,56 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = swap32 (w2[0]); w2_t[0] = swap32 (w2[0]);
w2_t[1] = swap32 (w2[1]); w2_t[1] = swap32 (w2[1]);
w2_t[2] = swap32 (w2[2]); w2_t[2] = swap32 (w2[2]);
w2_t[3] = swap32 (w2[3]); w2_t[3] = swap32 (w2[3]);
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = swap32 (w3[0]); w3_t[0] = swap32 (w3[0]);
w3_t[1] = swap32 (w3[1]); w3_t[1] = swap32 (w3[1]);
@ -484,12 +447,7 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
const u32 r0 = digest[3]; COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_M
} }
} }
@ -534,43 +492,20 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -614,39 +549,25 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -655,56 +576,56 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
w0_t[2] = swap32 (w0[2]); w0_t[2] = swap32 (w0[2]);
w0_t[3] = swap32 (w0[3]); w0_t[3] = swap32 (w0[3]);
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = swap32 (w1[0]); w1_t[0] = swap32 (w1[0]);
w1_t[1] = swap32 (w1[1]); w1_t[1] = swap32 (w1[1]);
w1_t[2] = swap32 (w1[2]); w1_t[2] = swap32 (w1[2]);
w1_t[3] = swap32 (w1[3]); w1_t[3] = swap32 (w1[3]);
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = swap32 (w2[0]); w2_t[0] = swap32 (w2[0]);
w2_t[1] = swap32 (w2[1]); w2_t[1] = swap32 (w2[1]);
w2_t[2] = swap32 (w2[2]); w2_t[2] = swap32 (w2[2]);
w2_t[3] = swap32 (w2[3]); w2_t[3] = swap32 (w2[3]);
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = swap32 (w3[0]); w3_t[0] = swap32 (w3[0]);
w3_t[1] = swap32 (w3[1]); w3_t[1] = swap32 (w3[1]);
@ -847,12 +768,7 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
const u32 r0 = digest[3]; COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]);
const u32 r1 = digest[4];
const u32 r2 = digest[2];
const u32 r3 = digest[1];
#include COMPARE_S
} }
} }

View File

@ -7,6 +7,8 @@
#define _DES_ #define _DES_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define PERM_OP(a,b,tt,n,m) \ #define PERM_OP(a,b,tt,n,m) \
{ \ { \
@ -583,7 +583,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
@ -599,69 +599,49 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* main * main
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
pw_len = (pw_len >= 8) ? 8 : pw_len; pw_len = (pw_len >= 8) ? 8 : pw_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = 0;
wordr0[3] = 0;
u32 wordr1[4];
wordr1[0] = 0;
wordr1[1] = 0;
wordr1[2] = 0;
wordr1[3] = 0;
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = 0; w0[2] = 0;
w0[3] = 0; w0[3] = 0;
u32 w1[4]; u32x w1[4];
w1[0] = 0; w1[0] = 0;
w1[1] = 0; w1[1] = 0;
w1[2] = 0; w1[2] = 0;
w1[3] = 0; w1[3] = 0;
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
@ -689,12 +669,10 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
_des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans);
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_M COMPARE_M_SIMD (iv[0], iv[1], c, d);
} }
} }
@ -784,7 +762,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{ {
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
} }
/** /**
@ -812,69 +790,49 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* main * main
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
pw_len = (pw_len >= 8) ? 8 : pw_len; pw_len = (pw_len >= 8) ? 8 : pw_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = 0;
wordr0[3] = 0;
u32 wordr1[4];
wordr1[0] = 0;
wordr1[1] = 0;
wordr1[2] = 0;
wordr1[3] = 0;
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = 0; w0[2] = 0;
w0[3] = 0; w0[3] = 0;
u32 w1[4]; u32x w1[4];
w1[0] = 0; w1[0] = 0;
w1[1] = 0; w1[1] = 0;
w1[2] = 0; w1[2] = 0;
w1[3] = 0; w1[3] = 0;
u32 w2[4]; u32x w2[4];
w2[0] = 0; w2[0] = 0;
w2[1] = 0; w2[1] = 0;
w2[2] = 0; w2[2] = 0;
w2[3] = 0; w2[3] = 0;
u32 w3[4]; u32x w3[4];
w3[0] = 0; w3[0] = 0;
w3[1] = 0; w3[1] = 0;
@ -902,12 +860,10 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
_des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans);
const u32 r0 = iv[0]; u32x c = 0;
const u32 r1 = iv[1]; u32x d = 0;
const u32 r2 = 0;
const u32 r3 = 0;
#include COMPARE_S COMPARE_S_SIMD (iv[0], iv[1], c, d);
} }
} }

View File

@ -7,6 +7,8 @@
#define _LOTUS5_ #define _LOTUS5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 lotus_magic_table[256] = __constant u32 lotus_magic_table[256] =
{ {
@ -261,85 +261,50 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];
@ -434,41 +399,20 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -485,46 +429,32 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
u32 pw_len = pw_l_len + pw_r_len; u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];
@ -543,7 +473,7 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w[14] = wordl3[2] | wordr3[2]; w[14] = wordl3[2] | wordr3[2];
w[15] = wordl3[3] | wordr3[3]; w[15] = wordl3[3] | wordr3[3];
u32 state[4]; u32x state[4];
state[0] = 0; state[0] = 0;
state[1] = 0; state[1] = 0;

View File

@ -7,6 +7,8 @@
#define _LOTUS6_ #define _LOTUS6_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 lotus_magic_table[256] = __constant u32 lotus_magic_table[256] =
{ {
@ -292,41 +292,20 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -338,46 +317,32 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];
@ -396,7 +361,7 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w[14] = wordl3[2] | wordr3[2]; w[14] = wordl3[2] | wordr3[2];
w[15] = wordl3[3] | wordr3[3]; w[15] = wordl3[3] | wordr3[3];
u32 state[4]; u32x state[4];
state[0] = 0; state[0] = 0;
state[1] = 0; state[1] = 0;
@ -469,10 +434,10 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
domino_big_md (w, 34, state, s_lotus_magic_table); domino_big_md (w, 34, state, s_lotus_magic_table);
u32 a = state[0] & 0xffffffff; u32x a = state[0] & 0xffffffff;
u32 b = state[1] & 0xffffffff; u32x b = state[1] & 0xffffffff;
u32 c = state[2] & 0x000000ff; u32x c = state[2] & 0x000000ff;
u32 d = state[3] & 0x00000000; u32x d = state[3] & 0x00000000;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;
@ -531,41 +496,20 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -589,46 +533,32 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];
@ -647,7 +577,7 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w[14] = wordl3[2] | wordr3[2]; w[14] = wordl3[2] | wordr3[2];
w[15] = wordl3[3] | wordr3[3]; w[15] = wordl3[3] | wordr3[3];
u32 state[4]; u32x state[4];
state[0] = 0; state[0] = 0;
state[1] = 0; state[1] = 0;
@ -720,10 +650,10 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
domino_big_md (w, 34, state, s_lotus_magic_table); domino_big_md (w, 34, state, s_lotus_magic_table);
u32 a = state[0] & 0xffffffff; u32x a = state[0] & 0xffffffff;
u32 b = state[1] & 0xffffffff; u32x b = state[1] & 0xffffffff;
u32 c = state[2] & 0x000000ff; u32x c = state[2] & 0x000000ff;
u32 d = state[3] & 0x00000000; u32x d = state[3] & 0x00000000;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;

View File

@ -5,6 +5,8 @@
#define _OLDOFFICE01_ #define _OLDOFFICE01_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{ {
@ -478,41 +478,20 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -528,67 +507,53 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -597,10 +562,10 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_2x4 (w0, w1, pw_len); append_0x80_2x4 (w0, w1, pw_len);
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -630,8 +595,8 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
gen336 (digest_pre, salt_buf, digest); gen336 (digest_pre, salt_buf, digest);
u32 a = digest[0]; u32x a = digest[0];
u32 b = digest[1] & 0xff; u32x b = digest[1] & 0xff;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;
@ -666,41 +631,20 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -728,67 +672,53 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -797,10 +727,10 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_2x4 (w0, w1, pw_len); append_0x80_2x4 (w0, w1, pw_len);
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -830,8 +760,8 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
gen336 (digest_pre, salt_buf, digest); gen336 (digest_pre, salt_buf, digest);
u32 a = digest[0]; u32x a = digest[0];
u32 b = digest[1] & 0xff; u32x b = digest[1] & 0xff;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;

View File

@ -5,6 +5,8 @@
#define _OLDOFFICE34_ #define _OLDOFFICE34_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{ {
@ -164,41 +164,20 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -216,69 +195,55 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -287,10 +252,10 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_2x4 (w0, w1, pw_len); append_0x80_2x4 (w0, w1, pw_len);
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -349,8 +314,8 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
u32 a = swap32 (digest[0]); u32x a = swap32 (digest[0]);
u32 b = swap32 (digest[1]) & 0xff; u32x b = swap32 (digest[1]) & 0xff;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;
@ -385,41 +350,20 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -449,69 +393,55 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
const u32 pw_salt_len = (pw_len * 2) + salt_len; const u32 pw_salt_len = (pw_len * 2) + salt_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -520,10 +450,10 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_2x4 (w0, w1, pw_len); append_0x80_2x4 (w0, w1, pw_len);
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
make_unicode (w0, w0_t, w1_t); make_unicode (w0, w0_t, w1_t);
make_unicode (w1, w2_t, w3_t); make_unicode (w1, w2_t, w3_t);
@ -582,8 +512,8 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); sha1_transform (w0_t, w1_t, w2_t, w3_t, digest);
u32 a = swap32 (digest[0]); u32x a = swap32 (digest[0]);
u32 b = swap32 (digest[1]) & 0xff; u32x b = swap32 (digest[1]) & 0xff;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,116 +36,81 @@ __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -319,13 +284,7 @@ __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -353,41 +312,20 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -404,77 +342,63 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01);
@ -651,13 +575,7 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SIPHASH_ #define _SIPHASH_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define SIPROUND(v0,v1,v2,v3) \ #define SIPROUND(v0,v1,v2,v3) \
(v0) += (v1); \ (v0) += (v1); \
@ -52,41 +52,20 @@ __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* base * base
*/ */
@ -105,46 +84,32 @@ __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];
@ -232,41 +197,20 @@ __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -297,46 +241,32 @@ __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 padding[8] = __constant u32 padding[8] =
{ {
@ -146,43 +146,20 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* U_buf * U_buf
*/ */
@ -211,77 +188,63 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
// max length supported by pdf11 is 32 // max length supported by pdf11 is 32
@ -353,8 +316,8 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
md5_transform (w0_t, w1_t, w2_t, w3_t, digest); md5_transform (w0_t, w1_t, w2_t, w3_t, digest);
u32 a = digest[0]; u32x a = digest[0];
u32 b = digest[1] & 0xff; u32x b = digest[1] & 0xff;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;
@ -389,43 +352,20 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -466,77 +406,63 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
w3[2] = wordl3[2] | wordr3[2]; w3[2] = wordl3[2] | wordr3[2];
w3[3] = wordl3[3] | wordr3[3]; w3[3] = wordl3[3] | wordr3[3];
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
// max length supported by pdf11 is 32 // max length supported by pdf11 is 32
@ -608,8 +534,8 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
md5_transform (w0_t, w1_t, w2_t, w3_t, digest); md5_transform (w0_t, w1_t, w2_t, w3_t, digest);
u32 a = digest[0]; u32x a = digest[0];
u32 b = digest[1] & 0xff; u32x b = digest[1] & 0xff;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;

View File

@ -5,6 +5,8 @@
#define _SHA384_ #define _SHA384_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u64 k_sha384[80] = __constant u64 k_sha384[80] =
{ {
@ -156,80 +156,43 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -238,10 +201,10 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -264,10 +227,10 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* SHA384 * SHA384
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);
@ -332,43 +295,20 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -385,39 +325,25 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -426,10 +352,10 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -452,10 +378,10 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* SHA384 * SHA384
*/ */
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = swap32 (w0[0]); w0_t[0] = swap32 (w0[0]);
w0_t[1] = swap32 (w0[1]); w0_t[1] = swap32 (w0[1]);

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,41 +36,20 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -107,43 +86,35 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -164,7 +135,7 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_4x4 (w0, w1, w2, w3, pw_len); append_0x80_4x4 (w0, w1, w2, w3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
/** /**
* prepend salt * prepend salt
@ -172,10 +143,10 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
// first step fixed 56 bytes of salt // first step fixed 56 bytes of salt
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
@ -203,10 +174,10 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
// first transform // first transform
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -377,13 +348,7 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -411,41 +376,20 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -494,43 +438,35 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32 wordr1[4]; u32x wordr1[4] = { 0 };
u32 wordr2[4]; u32x wordr2[4] = { 0 };
u32 wordr3[4]; u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
wordr1[1] = combs_buf[il_pos].i[5]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = combs_buf[il_pos].i[6]; wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[3] = combs_buf[il_pos].i[7]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
u32 w1[4]; u32x w1[4];
u32 w2[4]; u32x w2[4];
u32 w3[4]; u32x w3[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
@ -551,7 +487,7 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
append_0x80_4x4 (w0, w1, w2, w3, pw_len); append_0x80_4x4 (w0, w1, w2, w3, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
/** /**
* prepend salt * prepend salt
@ -559,10 +495,10 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
// first step fixed 56 bytes of salt // first step fixed 56 bytes of salt
u32 w0_t[4]; u32x w0_t[4];
u32 w1_t[4]; u32x w1_t[4];
u32 w2_t[4]; u32x w2_t[4];
u32 w3_t[4]; u32x w3_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
@ -590,10 +526,10 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
// first transform // first transform
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -764,13 +700,7 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -55,41 +55,20 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* challenge * challenge
*/ */
@ -122,67 +101,53 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2]; w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3]; w0_t[3] = salt_buf0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = salt_buf1[0]; w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1]; w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2]; w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3]; w1_t[3] = salt_buf1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -195,7 +160,7 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
w0_t[0] |= wordl0[0] | wordr0[0]; w0_t[0] |= wordl0[0] | wordr0[0];
w0_t[1] |= wordl0[1] | wordr0[1]; w0_t[1] |= wordl0[1] | wordr0[1];
@ -221,10 +186,10 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 ($pass.$salt) * md5 ($pass.$salt)
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -404,13 +369,7 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -455,41 +414,20 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* challenge * challenge
*/ */
@ -534,67 +472,53 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = salt_buf0[0]; w0_t[0] = salt_buf0[0];
w0_t[1] = salt_buf0[1]; w0_t[1] = salt_buf0[1];
w0_t[2] = salt_buf0[2]; w0_t[2] = salt_buf0[2];
w0_t[3] = salt_buf0[3]; w0_t[3] = salt_buf0[3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = salt_buf1[0]; w1_t[0] = salt_buf1[0];
w1_t[1] = salt_buf1[1]; w1_t[1] = salt_buf1[1];
w1_t[2] = salt_buf1[2]; w1_t[2] = salt_buf1[2];
w1_t[3] = salt_buf1[3]; w1_t[3] = salt_buf1[3];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = 0; w2_t[0] = 0;
w2_t[1] = 0; w2_t[1] = 0;
w2_t[2] = 0; w2_t[2] = 0;
w2_t[3] = 0; w2_t[3] = 0;
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = 0; w3_t[0] = 0;
w3_t[1] = 0; w3_t[1] = 0;
@ -607,7 +531,7 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len);
const u32 pw_salt_len = pw_len + salt_len; const u32x pw_salt_len = pw_len + salt_len;
w0_t[0] |= wordl0[0] | wordr0[0]; w0_t[0] |= wordl0[0] | wordr0[0];
w0_t[1] |= wordl0[1] | wordr0[1]; w0_t[1] |= wordl0[1] | wordr0[1];
@ -633,10 +557,10 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* md5 ($pass.$salt) * md5 ($pass.$salt)
*/ */
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -816,13 +740,7 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33);
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _SHA1_ #define _SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 combs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{ {
@ -36,43 +36,20 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -89,39 +66,25 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -130,28 +93,28 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -162,28 +125,28 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 ($pass) * sha1 ($pass)
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -551,13 +514,7 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
c ^= plain_sha1_c; c ^= plain_sha1_c;
d ^= plain_sha1_d; d ^= plain_sha1_d;
e ^= plain_sha1_e; e ^= plain_sha1_e;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -585,43 +542,20 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -650,39 +584,25 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -691,28 +611,28 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -723,28 +643,28 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 ($pass) * sha1 ($pass)
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -1112,13 +1032,7 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
c ^= plain_sha1_c; c ^= plain_sha1_c;
d ^= plain_sha1_d; d ^= plain_sha1_d;
e ^= plain_sha1_e; e ^= plain_sha1_e;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = d;
const u32 r1 = e;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _MD5_ #define _MD5_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_lower8(i) l_bin2asc[(i)] #define uint_to_hex_lower8(i) l_bin2asc[(i)]
@ -776,43 +776,20 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -927,67 +904,53 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -1044,28 +1007,28 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = block0[ 0]; w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1]; w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2]; w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3]; w0_t[3] = block0[ 3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = block0[ 4]; w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5]; w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6]; w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7]; w1_t[3] = block0[ 7];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = block0[ 8]; w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9]; w2_t[1] = block0[ 9];
w2_t[2] = block0[10]; w2_t[2] = block0[10];
w2_t[3] = block0[11]; w2_t[3] = block0[11];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = block0[12]; w3_t[0] = block0[12];
w3_t[1] = block0[13]; w3_t[1] = block0[13];
@ -1079,10 +1042,10 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
// md5 // md5
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -1584,13 +1547,7 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_M_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_M
} }
} }
@ -1635,43 +1592,20 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -1798,67 +1732,53 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -1915,28 +1835,28 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); block_len = memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len);
u32 w0_t[4]; u32x w0_t[4];
w0_t[0] = block0[ 0]; w0_t[0] = block0[ 0];
w0_t[1] = block0[ 1]; w0_t[1] = block0[ 1];
w0_t[2] = block0[ 2]; w0_t[2] = block0[ 2];
w0_t[3] = block0[ 3]; w0_t[3] = block0[ 3];
u32 w1_t[4]; u32x w1_t[4];
w1_t[0] = block0[ 4]; w1_t[0] = block0[ 4];
w1_t[1] = block0[ 5]; w1_t[1] = block0[ 5];
w1_t[2] = block0[ 6]; w1_t[2] = block0[ 6];
w1_t[3] = block0[ 7]; w1_t[3] = block0[ 7];
u32 w2_t[4]; u32x w2_t[4];
w2_t[0] = block0[ 8]; w2_t[0] = block0[ 8];
w2_t[1] = block0[ 9]; w2_t[1] = block0[ 9];
w2_t[2] = block0[10]; w2_t[2] = block0[10];
w2_t[3] = block0[11]; w2_t[3] = block0[11];
u32 w3_t[4]; u32x w3_t[4];
w3_t[0] = block0[12]; w3_t[0] = block0[12];
w3_t[1] = block0[13]; w3_t[1] = block0[13];
@ -1950,10 +1870,10 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
// md5 // md5
u32 a = MD5M_A; u32x a = MD5M_A;
u32 b = MD5M_B; u32x b = MD5M_B;
u32 c = MD5M_C; u32x c = MD5M_C;
u32 d = MD5M_D; u32x d = MD5M_D;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01);
@ -2455,13 +2375,7 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
b += r_b; b += r_b;
c += r_c; c += r_c;
d += r_d; d += r_d;
COMPARE_S_SIMD (a, d, c, b);
const u32 r0 = a;
const u32 r1 = d;
const u32 r2 = c;
const u32 r3 = b;
#include COMPARE_S
} }
} }

View File

@ -5,6 +5,8 @@
#define _CRC32_ #define _CRC32_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
__constant u32 crc32tab[0x100] = __constant u32 crc32tab[0x100] =
{ {
@ -143,85 +143,50 @@ __kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w_t[16]; u32x w_t[16];
w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 0] = wordl0[0] | wordr0[0];
w_t[ 1] = wordl0[1] | wordr0[1]; w_t[ 1] = wordl0[1] | wordr0[1];
@ -240,8 +205,8 @@ __kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w_t[14] = wordl3[2] | wordr3[2]; w_t[14] = wordl3[2] | wordr3[2];
w_t[15] = 0; w_t[15] = 0;
u32 a = crc32 (w_t, pw_len, iv); u32x a = crc32 (w_t, pw_len, iv);
u32 b = 0; u32x b = 0;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;
@ -276,41 +241,20 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
if (gid >= gid_max) return; if (gid >= gid_max) return;
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* digest * digest
*/ */
@ -329,46 +273,32 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w_t[16]; u32x w_t[16];
w_t[ 0] = wordl0[0] | wordr0[0]; w_t[ 0] = wordl0[0] | wordr0[0];
w_t[ 1] = wordl0[1] | wordr0[1]; w_t[ 1] = wordl0[1] | wordr0[1];
@ -387,8 +317,8 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
w_t[14] = wordl3[2] | wordr3[2]; w_t[14] = wordl3[2] | wordr3[2];
w_t[15] = 0; w_t[15] = 0;
u32 a = crc32 (w_t, pw_len, iv); u32x a = crc32 (w_t, pw_len, iv);
u32 b = 0; u32x b = 0;
const u32 r0 = a; const u32 r0 = a;
const u32 r1 = b; const u32 r1 = b;

View File

@ -7,6 +7,8 @@
#define _GOST2012_256_ #define _GOST2012_256_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define INITVAL 0x0101010101010101 #define INITVAL 0x0101010101010101
@ -2320,87 +2320,50 @@ __kernel void m11700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];
@ -2539,87 +2502,50 @@ __kernel void m11700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];

View File

@ -7,6 +7,8 @@
#define _GOST2012_512_ #define _GOST2012_512_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -18,9 +20,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define INITVAL 0 #define INITVAL 0
@ -2320,87 +2320,50 @@ __kernel void m11800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];
@ -2539,87 +2502,50 @@ __kernel void m11800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w[16]; u32x w[16];
w[ 0] = wordl0[0] | wordr0[0]; w[ 0] = wordl0[0] | wordr0[0];
w[ 1] = wordl0[1] | wordr0[1]; w[ 1] = wordl0[1] | wordr0[1];

View File

@ -5,6 +5,8 @@
#define _SHA256_SHA1_ #define _SHA256_SHA1_
#define NEW_SIMD_CODE
#include "include/constants.h" #include "include/constants.h"
#include "include/kernel_vendor.h" #include "include/kernel_vendor.h"
@ -16,9 +18,7 @@
#include "include/kernel_functions.c" #include "include/kernel_functions.c"
#include "OpenCL/types_ocl.c" #include "OpenCL/types_ocl.c"
#include "OpenCL/common.c" #include "OpenCL/common.c"
#include "OpenCL/simd.c"
#define COMPARE_S "OpenCL/check_single_comp4.c"
#define COMPARE_M "OpenCL/check_multi_comp4.c"
#define uint_to_hex_upper8(i) l_bin2asc[(i)] #define uint_to_hex_upper8(i) l_bin2asc[(i)]
@ -55,43 +55,20 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -111,39 +88,25 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -152,28 +115,28 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -184,31 +147,31 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
u32 f = 0; u32x f = 0;
u32 g = 0; u32x g = 0;
u32 h = 0; u32x h = 0;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -431,13 +394,7 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_M_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_M
} }
} }
@ -482,43 +439,20 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* base * base
*/ */
u32 wordl0[4]; u32 pws0[4] = { 0 };
u32 pws1[4] = { 0 };
wordl0[0] = pws[gid].i[ 0]; pws0[0] = pws[gid].i[0];
wordl0[1] = pws[gid].i[ 1]; pws0[1] = pws[gid].i[1];
wordl0[2] = pws[gid].i[ 2]; pws0[2] = pws[gid].i[2];
wordl0[3] = pws[gid].i[ 3]; pws0[3] = pws[gid].i[3];
pws1[0] = pws[gid].i[4];
u32 wordl1[4]; pws1[1] = pws[gid].i[5];
pws1[2] = pws[gid].i[6];
wordl1[0] = pws[gid].i[ 4]; pws1[3] = pws[gid].i[7];
wordl1[1] = pws[gid].i[ 5];
wordl1[2] = pws[gid].i[ 6];
wordl1[3] = pws[gid].i[ 7];
u32 wordl2[4];
wordl2[0] = 0;
wordl2[1] = 0;
wordl2[2] = 0;
wordl2[3] = 0;
u32 wordl3[4];
wordl3[0] = 0;
wordl3[1] = 0;
wordl3[2] = 0;
wordl3[3] = 0;
const u32 pw_l_len = pws[gid].pw_len; const u32 pw_l_len = pws[gid].pw_len;
if (combs_mode == COMBINATOR_MODE_BASE_RIGHT)
{
append_0x80_2x4 (wordl0, wordl1, pw_l_len);
switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len);
}
/** /**
* salt * salt
*/ */
@ -550,39 +484,25 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* loop * loop
*/ */
for (u32 il_pos = 0; il_pos < combs_cnt; il_pos++) for (u32 il_pos = 0; il_pos < combs_cnt; il_pos += VECT_SIZE)
{ {
const u32 pw_r_len = combs_buf[il_pos].pw_len; const u32x pw_r_len = pwlenx_create_combt (combs_buf, il_pos);
const u32 pw_len = pw_l_len + pw_r_len; const u32x pw_len = pw_l_len + pw_r_len;
u32 wordr0[4]; u32x wordr0[4] = { 0 };
u32x wordr1[4] = { 0 };
u32x wordr2[4] = { 0 };
u32x wordr3[4] = { 0 };
wordr0[0] = combs_buf[il_pos].i[0]; wordr0[0] = ix_create_combt (combs_buf, il_pos, 0);
wordr0[1] = combs_buf[il_pos].i[1]; wordr0[1] = ix_create_combt (combs_buf, il_pos, 1);
wordr0[2] = combs_buf[il_pos].i[2]; wordr0[2] = ix_create_combt (combs_buf, il_pos, 2);
wordr0[3] = combs_buf[il_pos].i[3]; wordr0[3] = ix_create_combt (combs_buf, il_pos, 3);
wordr1[0] = ix_create_combt (combs_buf, il_pos, 4);
u32 wordr1[4]; wordr1[1] = ix_create_combt (combs_buf, il_pos, 5);
wordr1[2] = ix_create_combt (combs_buf, il_pos, 6);
wordr1[0] = combs_buf[il_pos].i[4]; wordr1[3] = ix_create_combt (combs_buf, il_pos, 7);
wordr1[1] = combs_buf[il_pos].i[5];
wordr1[2] = combs_buf[il_pos].i[6];
wordr1[3] = combs_buf[il_pos].i[7];
u32 wordr2[4];
wordr2[0] = 0;
wordr2[1] = 0;
wordr2[2] = 0;
wordr2[3] = 0;
u32 wordr3[4];
wordr3[0] = 0;
wordr3[1] = 0;
wordr3[2] = 0;
wordr3[3] = 0;
if (combs_mode == COMBINATOR_MODE_BASE_LEFT) if (combs_mode == COMBINATOR_MODE_BASE_LEFT)
{ {
@ -591,28 +511,28 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len);
} }
u32 w0[4]; u32x w0[4];
w0[0] = wordl0[0] | wordr0[0]; w0[0] = wordl0[0] | wordr0[0];
w0[1] = wordl0[1] | wordr0[1]; w0[1] = wordl0[1] | wordr0[1];
w0[2] = wordl0[2] | wordr0[2]; w0[2] = wordl0[2] | wordr0[2];
w0[3] = wordl0[3] | wordr0[3]; w0[3] = wordl0[3] | wordr0[3];
u32 w1[4]; u32x w1[4];
w1[0] = wordl1[0] | wordr1[0]; w1[0] = wordl1[0] | wordr1[0];
w1[1] = wordl1[1] | wordr1[1]; w1[1] = wordl1[1] | wordr1[1];
w1[2] = wordl1[2] | wordr1[2]; w1[2] = wordl1[2] | wordr1[2];
w1[3] = wordl1[3] | wordr1[3]; w1[3] = wordl1[3] | wordr1[3];
u32 w2[4]; u32x w2[4];
w2[0] = wordl2[0] | wordr2[0]; w2[0] = wordl2[0] | wordr2[0];
w2[1] = wordl2[1] | wordr2[1]; w2[1] = wordl2[1] | wordr2[1];
w2[2] = wordl2[2] | wordr2[2]; w2[2] = wordl2[2] | wordr2[2];
w2[3] = wordl2[3] | wordr2[3]; w2[3] = wordl2[3] | wordr2[3];
u32 w3[4]; u32x w3[4];
w3[0] = wordl3[0] | wordr3[0]; w3[0] = wordl3[0] | wordr3[0];
w3[1] = wordl3[1] | wordr3[1]; w3[1] = wordl3[1] | wordr3[1];
@ -623,31 +543,31 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
* sha1 * sha1
*/ */
u32 w0_t = swap32 (w0[0]); u32x w0_t = swap32 (w0[0]);
u32 w1_t = swap32 (w0[1]); u32x w1_t = swap32 (w0[1]);
u32 w2_t = swap32 (w0[2]); u32x w2_t = swap32 (w0[2]);
u32 w3_t = swap32 (w0[3]); u32x w3_t = swap32 (w0[3]);
u32 w4_t = swap32 (w1[0]); u32x w4_t = swap32 (w1[0]);
u32 w5_t = swap32 (w1[1]); u32x w5_t = swap32 (w1[1]);
u32 w6_t = swap32 (w1[2]); u32x w6_t = swap32 (w1[2]);
u32 w7_t = swap32 (w1[3]); u32x w7_t = swap32 (w1[3]);
u32 w8_t = swap32 (w2[0]); u32x w8_t = swap32 (w2[0]);
u32 w9_t = swap32 (w2[1]); u32x w9_t = swap32 (w2[1]);
u32 wa_t = swap32 (w2[2]); u32x wa_t = swap32 (w2[2]);
u32 wb_t = swap32 (w2[3]); u32x wb_t = swap32 (w2[3]);
u32 wc_t = swap32 (w3[0]); u32x wc_t = swap32 (w3[0]);
u32 wd_t = swap32 (w3[1]); u32x wd_t = swap32 (w3[1]);
u32 we_t = 0; u32x we_t = 0;
u32 wf_t = pw_len * 8; u32x wf_t = pw_len * 8;
u32 a = SHA1M_A; u32x a = SHA1M_A;
u32 b = SHA1M_B; u32x b = SHA1M_B;
u32 c = SHA1M_C; u32x c = SHA1M_C;
u32 d = SHA1M_D; u32x d = SHA1M_D;
u32 e = SHA1M_E; u32x e = SHA1M_E;
u32 f = 0; u32x f = 0;
u32 g = 0; u32x g = 0;
u32 h = 0; u32x h = 0;
#undef K #undef K
#define K SHA1C00 #define K SHA1C00
@ -870,13 +790,7 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf,
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d);
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e);
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f);
COMPARE_S_SIMD (d, h, c, g);
const u32 r0 = d;
const u32 r1 = h;
const u32 r2 = c;
const u32 r3 = g;
#include COMPARE_S
} }
} }

View File

@ -4494,6 +4494,8 @@ static void *thread_calc_stdin (void *p)
run_cracker (device_param, pws_cnt); run_cracker (device_param, pws_cnt);
device_param->pws_cnt = 0; device_param->pws_cnt = 0;
memset (device_param->pws_buf, 0, device_param->size_pws);
} }
} }
@ -4753,6 +4755,8 @@ static void *thread_calc (void *p)
run_cracker (device_param, pws_cnt); run_cracker (device_param, pws_cnt);
device_param->pws_cnt = 0; device_param->pws_cnt = 0;
memset (device_param->pws_buf, 0, device_param->size_pws);
} }
if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint (); if (data.devices_status == STATUS_STOP_AT_CHECKPOINT) check_checkpoint ();