mirror of
https://github.com/hashcat/hashcat.git
synced 2024-11-22 08:08:10 +00:00
SIMD for slow hashes prototype
This commit is contained in:
parent
74fa4c1886
commit
9b3d18f87d
310
OpenCL/m00400.cl
310
OpenCL/m00400.cl
@ -5,6 +5,8 @@
|
||||
|
||||
#define _MD5_
|
||||
|
||||
#define NEW_SIMD_CODE
|
||||
|
||||
#include "include/constants.h"
|
||||
#include "include/kernel_vendor.h"
|
||||
|
||||
@ -16,11 +18,12 @@
|
||||
#include "include/kernel_functions.c"
|
||||
#include "OpenCL/types_ocl.c"
|
||||
#include "OpenCL/common.c"
|
||||
#include "OpenCL/simd.c"
|
||||
|
||||
#define COMPARE_S "OpenCL/check_single_comp4.c"
|
||||
#define COMPARE_M "OpenCL/check_multi_comp4.c"
|
||||
|
||||
static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
|
||||
static void md5_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
|
||||
{
|
||||
u32 a = digest[0];
|
||||
u32 b = digest[1];
|
||||
@ -44,6 +47,104 @@ static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co
|
||||
u32 we_t = w3[2];
|
||||
u32 wf_t = 0;
|
||||
|
||||
MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
|
||||
MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
|
||||
MD5_STEP_S (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
|
||||
MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
|
||||
MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
|
||||
MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
|
||||
MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
|
||||
MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
|
||||
MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
|
||||
MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
|
||||
MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
|
||||
MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
|
||||
MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
|
||||
MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
|
||||
MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
|
||||
MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
|
||||
|
||||
MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
|
||||
MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
|
||||
MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
|
||||
MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
|
||||
MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
|
||||
MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
|
||||
MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
|
||||
MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
|
||||
MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
|
||||
MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
|
||||
MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
|
||||
MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
|
||||
MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
|
||||
MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
|
||||
MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
|
||||
MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
|
||||
|
||||
MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
|
||||
MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
|
||||
MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
|
||||
MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
|
||||
MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
|
||||
MD5_STEP_S (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
|
||||
MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
|
||||
MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
|
||||
MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
|
||||
MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
|
||||
MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
|
||||
MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
|
||||
MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
|
||||
MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
|
||||
MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
|
||||
MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
|
||||
|
||||
MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
|
||||
MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
|
||||
MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
|
||||
MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
|
||||
MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
|
||||
MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
|
||||
MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
|
||||
MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
|
||||
MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
|
||||
MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
|
||||
MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
|
||||
MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
|
||||
MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
|
||||
MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
|
||||
MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
|
||||
MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
|
||||
|
||||
digest[0] += a;
|
||||
digest[1] += b;
|
||||
digest[2] += c;
|
||||
digest[3] += d;
|
||||
}
|
||||
|
||||
static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
|
||||
{
|
||||
u32x a = digest[0];
|
||||
u32x b = digest[1];
|
||||
u32x c = digest[2];
|
||||
u32x d = digest[3];
|
||||
|
||||
u32x w0_t = w0[0];
|
||||
u32x w1_t = w0[1];
|
||||
u32x w2_t = w0[2];
|
||||
u32x w3_t = w0[3];
|
||||
u32x w4_t = w1[0];
|
||||
u32x w5_t = w1[1];
|
||||
u32x w6_t = w1[2];
|
||||
u32x w7_t = w1[3];
|
||||
u32x w8_t = w2[0];
|
||||
u32x w9_t = w2[1];
|
||||
u32x wa_t = w2[2];
|
||||
u32x wb_t = w2[3];
|
||||
u32x wc_t = w3[0];
|
||||
u32x wd_t = w3[1];
|
||||
u32x we_t = w3[2];
|
||||
u32x wf_t = 0;
|
||||
|
||||
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
|
||||
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
|
||||
MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
|
||||
@ -194,7 +295,7 @@ __kernel void m00400_init (__global pw_t *pws, __global kernel_rule_t *rules_buf
|
||||
block3[2] = block_len * 8;
|
||||
block3[3] = 0;
|
||||
|
||||
append_0x80_4x4 (block0, block1, block2, block3, block_len);
|
||||
append_0x80_4x4_S (block0, block1, block2, block3, block_len);
|
||||
|
||||
/**
|
||||
* init
|
||||
@ -207,7 +308,7 @@ __kernel void m00400_init (__global pw_t *pws, __global kernel_rule_t *rules_buf
|
||||
digest[2] = MD5M_C;
|
||||
digest[3] = MD5M_D;
|
||||
|
||||
md5_transform (block0, block1, block2, block3, digest);
|
||||
md5_transform_S (block0, block1, block2, block3, digest);
|
||||
|
||||
tmps[gid].digest_buf[0] = digest[0];
|
||||
tmps[gid].digest_buf[1] = digest[1];
|
||||
@ -225,75 +326,160 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf
|
||||
|
||||
if (gid >= gid_max) return;
|
||||
|
||||
u32 w0[4];
|
||||
u32x w0[4] = { 0 };
|
||||
u32x w1[4] = { 0 };
|
||||
u32x w2[4] = { 0 };
|
||||
|
||||
w0[0] = pws[gid].i[ 0];
|
||||
w0[1] = pws[gid].i[ 1];
|
||||
w0[2] = pws[gid].i[ 2];
|
||||
w0[3] = pws[gid].i[ 3];
|
||||
u32x pw_len = 0;
|
||||
|
||||
u32 w1[4];
|
||||
u32x digest[4] = { 0 };
|
||||
|
||||
w1[0] = pws[gid].i[ 4];
|
||||
w1[1] = pws[gid].i[ 5];
|
||||
w1[2] = pws[gid].i[ 6];
|
||||
w1[3] = pws[gid].i[ 7];
|
||||
#if VECT_SIZE == 1
|
||||
|
||||
u32 w2[4];
|
||||
w0[0] = pws[gid].i[0];
|
||||
w0[1] = pws[gid].i[1];
|
||||
w0[2] = pws[gid].i[2];
|
||||
w0[3] = pws[gid].i[3];
|
||||
w1[0] = pws[gid].i[4];
|
||||
w1[1] = pws[gid].i[5];
|
||||
w1[2] = pws[gid].i[6];
|
||||
w1[3] = pws[gid].i[7];
|
||||
w2[0] = pws[gid].i[8];
|
||||
w2[1] = pws[gid].i[9];
|
||||
|
||||
w2[0] = pws[gid].i[ 8];
|
||||
w2[1] = pws[gid].i[ 9];
|
||||
w2[2] = 0;
|
||||
w2[3] = 0;
|
||||
|
||||
const u32 pw_len = pws[gid].pw_len;
|
||||
|
||||
/**
|
||||
* digest
|
||||
*/
|
||||
|
||||
u32 digest[4];
|
||||
pw_len = pws[gid].pw_len;
|
||||
|
||||
digest[0] = tmps[gid].digest_buf[0];
|
||||
digest[1] = tmps[gid].digest_buf[1];
|
||||
digest[2] = tmps[gid].digest_buf[2];
|
||||
digest[3] = tmps[gid].digest_buf[3];
|
||||
|
||||
#else
|
||||
|
||||
const u32 gidx = gid * VECT_SIZE;
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
|
||||
if ((gidx + 0) < gid_max)
|
||||
{
|
||||
w0[0].s0 = pws[gidx + 0].i[0];
|
||||
w0[1].s0 = pws[gidx + 0].i[1];
|
||||
w0[2].s0 = pws[gidx + 0].i[2];
|
||||
w0[3].s0 = pws[gidx + 0].i[3];
|
||||
w1[0].s0 = pws[gidx + 0].i[4];
|
||||
w1[1].s0 = pws[gidx + 0].i[5];
|
||||
w1[2].s0 = pws[gidx + 0].i[6];
|
||||
w1[3].s0 = pws[gidx + 0].i[7];
|
||||
w2[0].s0 = pws[gidx + 0].i[8];
|
||||
w2[1].s0 = pws[gidx + 0].i[9];
|
||||
|
||||
pw_len.s0 = pws[gidx + 0].pw_len;
|
||||
|
||||
digest[0].s0 = tmps[gidx + 0].digest_buf[0];
|
||||
digest[1].s0 = tmps[gidx + 0].digest_buf[1];
|
||||
digest[2].s0 = tmps[gidx + 0].digest_buf[2];
|
||||
digest[3].s0 = tmps[gidx + 0].digest_buf[3];
|
||||
}
|
||||
|
||||
if ((gidx + 1) < gid_max)
|
||||
{
|
||||
w0[0].s1 = pws[gidx + 1].i[0];
|
||||
w0[1].s1 = pws[gidx + 1].i[1];
|
||||
w0[2].s1 = pws[gidx + 1].i[2];
|
||||
w0[3].s1 = pws[gidx + 1].i[3];
|
||||
w1[0].s1 = pws[gidx + 1].i[4];
|
||||
w1[1].s1 = pws[gidx + 1].i[5];
|
||||
w1[2].s1 = pws[gidx + 1].i[6];
|
||||
w1[3].s1 = pws[gidx + 1].i[7];
|
||||
w2[0].s1 = pws[gidx + 1].i[8];
|
||||
w2[1].s1 = pws[gidx + 1].i[9];
|
||||
|
||||
pw_len.s1 = pws[gidx + 1].pw_len;
|
||||
|
||||
digest[0].s1 = tmps[gidx + 1].digest_buf[0];
|
||||
digest[1].s1 = tmps[gidx + 1].digest_buf[1];
|
||||
digest[2].s1 = tmps[gidx + 1].digest_buf[2];
|
||||
digest[3].s1 = tmps[gidx + 1].digest_buf[3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
|
||||
if ((gidx + 2) < gid_max)
|
||||
{
|
||||
w0[0].s2 = pws[gidx + 2].i[0];
|
||||
w0[1].s2 = pws[gidx + 2].i[1];
|
||||
w0[2].s2 = pws[gidx + 2].i[2];
|
||||
w0[3].s2 = pws[gidx + 2].i[3];
|
||||
w1[0].s2 = pws[gidx + 2].i[4];
|
||||
w1[1].s2 = pws[gidx + 2].i[5];
|
||||
w1[2].s2 = pws[gidx + 2].i[6];
|
||||
w1[3].s2 = pws[gidx + 2].i[7];
|
||||
w2[0].s2 = pws[gidx + 2].i[8];
|
||||
w2[1].s2 = pws[gidx + 2].i[9];
|
||||
|
||||
pw_len.s2 = pws[gidx + 2].pw_len;
|
||||
|
||||
digest[0].s2 = tmps[gidx + 2].digest_buf[0];
|
||||
digest[1].s2 = tmps[gidx + 2].digest_buf[1];
|
||||
digest[2].s2 = tmps[gidx + 2].digest_buf[2];
|
||||
digest[3].s2 = tmps[gidx + 2].digest_buf[3];
|
||||
}
|
||||
|
||||
if ((gidx + 3) < gid_max)
|
||||
{
|
||||
w0[0].s3 = pws[gidx + 3].i[0];
|
||||
w0[1].s3 = pws[gidx + 3].i[1];
|
||||
w0[2].s3 = pws[gidx + 3].i[2];
|
||||
w0[3].s3 = pws[gidx + 3].i[3];
|
||||
w1[0].s3 = pws[gidx + 3].i[4];
|
||||
w1[1].s3 = pws[gidx + 3].i[5];
|
||||
w1[2].s3 = pws[gidx + 3].i[6];
|
||||
w1[3].s3 = pws[gidx + 3].i[7];
|
||||
w2[0].s3 = pws[gidx + 3].i[8];
|
||||
w2[1].s3 = pws[gidx + 3].i[9];
|
||||
|
||||
pw_len.s3 = pws[gidx + 3].pw_len;
|
||||
|
||||
digest[0].s3 = tmps[gidx + 3].digest_buf[0];
|
||||
digest[1].s3 = tmps[gidx + 3].digest_buf[1];
|
||||
digest[2].s3 = tmps[gidx + 3].digest_buf[2];
|
||||
digest[3].s3 = tmps[gidx + 3].digest_buf[3];
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* loop
|
||||
*/
|
||||
|
||||
u32 block_len = (16 + pw_len);
|
||||
u32x block_len = (16 + pw_len);
|
||||
|
||||
u32 block0[4];
|
||||
u32x block0[4];
|
||||
u32x block1[4];
|
||||
u32x block2[4];
|
||||
u32x block3[4];
|
||||
|
||||
block0[0] = 0;
|
||||
block0[1] = 0;
|
||||
block0[2] = 0;
|
||||
block0[3] = 0;
|
||||
|
||||
u32 block1[4];
|
||||
|
||||
block1[0] = w0[0];
|
||||
block1[1] = w0[1];
|
||||
block1[2] = w0[2];
|
||||
block1[3] = w0[3];
|
||||
|
||||
u32 block2[4];
|
||||
|
||||
block2[0] = w1[0];
|
||||
block2[1] = w1[1];
|
||||
block2[2] = w1[2];
|
||||
block2[3] = w1[3];
|
||||
|
||||
u32 block3[4];
|
||||
|
||||
block3[0] = w2[0];
|
||||
block3[1] = w2[1];
|
||||
block3[2] = block_len * 8;
|
||||
block3[3] = 0;
|
||||
|
||||
append_0x80_4x4 (block0, block1, block2, block3, block_len);
|
||||
append_0x80_4x4_VV (block0, block1, block2, block3, block_len);
|
||||
|
||||
/**
|
||||
* init
|
||||
@ -314,10 +500,55 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf
|
||||
md5_transform (block0, block1, block2, block3, digest);
|
||||
}
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
|
||||
tmps[gid].digest_buf[0] = digest[0];
|
||||
tmps[gid].digest_buf[1] = digest[1];
|
||||
tmps[gid].digest_buf[2] = digest[2];
|
||||
tmps[gid].digest_buf[3] = digest[3];
|
||||
|
||||
#else
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
|
||||
if ((gidx + 0) < gid_max)
|
||||
{
|
||||
tmps[gidx + 0].digest_buf[0] = digest[0].s0;
|
||||
tmps[gidx + 0].digest_buf[1] = digest[1].s0;
|
||||
tmps[gidx + 0].digest_buf[2] = digest[2].s0;
|
||||
tmps[gidx + 0].digest_buf[3] = digest[3].s0;
|
||||
}
|
||||
|
||||
if ((gidx + 1) < gid_max)
|
||||
{
|
||||
tmps[gidx + 1].digest_buf[0] = digest[0].s1;
|
||||
tmps[gidx + 1].digest_buf[1] = digest[1].s1;
|
||||
tmps[gidx + 1].digest_buf[2] = digest[2].s1;
|
||||
tmps[gidx + 1].digest_buf[3] = digest[3].s1;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
|
||||
if ((gidx + 2) < gid_max)
|
||||
{
|
||||
tmps[gidx + 2].digest_buf[0] = digest[0].s2;
|
||||
tmps[gidx + 2].digest_buf[1] = digest[1].s2;
|
||||
tmps[gidx + 2].digest_buf[2] = digest[2].s2;
|
||||
tmps[gidx + 2].digest_buf[3] = digest[3].s2;
|
||||
}
|
||||
|
||||
if ((gidx + 3) < gid_max)
|
||||
{
|
||||
tmps[gidx + 3].digest_buf[0] = digest[0].s3;
|
||||
tmps[gidx + 3].digest_buf[1] = digest[1].s3;
|
||||
tmps[gidx + 3].digest_buf[2] = digest[2].s3;
|
||||
tmps[gidx + 3].digest_buf[3] = digest[3].s3;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void m00400_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global phpass_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
|
||||
@ -327,11 +558,10 @@ __kernel void m00400_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf
|
||||
*/
|
||||
|
||||
const u32 gid = get_global_id (0);
|
||||
const u32 lid = get_local_id (0);
|
||||
|
||||
if (gid >= gid_max) return;
|
||||
|
||||
const u32 lid = get_local_id (0);
|
||||
|
||||
/**
|
||||
* digest
|
||||
*/
|
||||
|
@ -351,71 +351,6 @@ static inline u64 rotl64_S (const u64 a, const u32 n)
|
||||
return rotr64_S (a, 64 - n);
|
||||
}
|
||||
|
||||
#if CUDA_ARCH >= 500
|
||||
static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
|
||||
{
|
||||
u32 r;
|
||||
@ -850,281 +785,6 @@ static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CUDA_ARCH >= 500
|
||||
static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
|
||||
{
|
||||
u32x r;
|
||||
|
||||
#if VECT_SIZE == 1
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 2
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 4
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 8
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
|
||||
#endif
|
||||
|
||||
#if VECT_SIZE >= 16
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
|
||||
asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef IS_GENERIC
|
||||
|
@ -71,6 +71,7 @@ ALIAS_nv_budget * 21 2 A
|
||||
ALIAS_nv_budget * 22 2 A A
|
||||
ALIAS_nv_budget * 23 2 A A
|
||||
ALIAS_nv_budget * 200 2 A A
|
||||
ALIAS_nv_budget * 400 2 A A
|
||||
ALIAS_nv_budget * 900 2 A A
|
||||
ALIAS_nv_budget * 1000 2 A A
|
||||
ALIAS_nv_budget * 1100 2 A A
|
||||
|
@ -123,4 +123,6 @@ void log_error (const char *fmt, ...);
|
||||
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
|
||||
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
|
||||
|
||||
#define CEIL(a) ((a - (int) (a)) > 0 ? a + 1 : a)
|
||||
|
||||
#endif // COMMON_H
|
||||
|
@ -10,15 +10,9 @@
|
||||
#define MD4_H_S(x,y,z) ((x) ^ (y) ^ (z))
|
||||
|
||||
#ifdef IS_NV
|
||||
#if CUDA_ARCH >= 500
|
||||
#define MD4_F(x,y,z) lut3_ca ((x), (y), (z))
|
||||
#define MD4_G(x,y,z) lut3_e8 ((x), (y), (z))
|
||||
#define MD4_H(x,y,z) lut3_96 ((x), (y), (z))
|
||||
#else
|
||||
#define MD4_F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
|
||||
#define MD4_G(x,y,z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
|
||||
#define MD4_H(x,y,z) ((x) ^ (y) ^ (z))
|
||||
#endif
|
||||
#define MD4_Fo(x,y,z) (MD4_F((x), (y), (z)))
|
||||
#define MD4_Go(x,y,z) (MD4_G((x), (y), (z)))
|
||||
#endif
|
||||
@ -64,17 +58,10 @@
|
||||
#define MD5_I_S(x,y,z) ((y) ^ ((x) | ~(z)))
|
||||
|
||||
#ifdef IS_NV
|
||||
#if CUDA_ARCH >= 500
|
||||
#define MD5_F(x,y,z) lut3_ca ((x), (y), (z))
|
||||
#define MD5_G(x,y,z) lut3_e4 ((x), (y), (z))
|
||||
#define MD5_H(x,y,z) lut3_96 ((x), (y), (z))
|
||||
#define MD5_I(x,y,z) lut3_39 ((x), (y), (z))
|
||||
#else
|
||||
#define MD5_F(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
|
||||
#define MD5_G(x,y,z) ((y) ^ ((z) & ((x) ^ (y))))
|
||||
#define MD5_H(x,y,z) ((x) ^ (y) ^ (z))
|
||||
#define MD5_I(x,y,z) ((y) ^ ((x) | ~(z)))
|
||||
#endif
|
||||
#define MD5_Fo(x,y,z) (MD5_F((x), (y), (z)))
|
||||
#define MD5_Go(x,y,z) (MD5_G((x), (y), (z)))
|
||||
#endif
|
||||
@ -97,6 +84,15 @@
|
||||
#define MD5_Go(x,y,z) (MD5_G((x), (y), (z)))
|
||||
#endif
|
||||
|
||||
#define MD5_STEP_S(f,a,b,c,d,x,K,s) \
|
||||
{ \
|
||||
a += K; \
|
||||
a += x; \
|
||||
a += f (b, c, d); \
|
||||
a = rotl32_S (a, s); \
|
||||
a += b; \
|
||||
}
|
||||
|
||||
#define MD5_STEP(f,a,b,c,d,x,K,s) \
|
||||
{ \
|
||||
a += K; \
|
||||
@ -118,15 +114,9 @@
|
||||
#if defined _SHA1_ || defined _SAPG_ || defined _OFFICE2007_ || defined _OFFICE2010_ || defined _OLDOFFICE34_ || defined _ANDROIDFDE_ || defined _DCC2_ || defined _WPA_ || defined _MD5_SHA1_ || defined _SHA1_MD5_ || defined _PSAFE2_ || defined _LOTUS8_ || defined _PBKDF2_SHA1_ || defined _RAR3_ || defined _SHA256_SHA1_
|
||||
|
||||
#ifdef IS_NV
|
||||
#if CUDA_ARCH >= 500
|
||||
#define SHA1_F0(x,y,z) lut3_ca ((x), (y), (z))
|
||||
#define SHA1_F1(x,y,z) lut3_96 ((x), (y), (z))
|
||||
#define SHA1_F2(x,y,z) lut3_e8 ((x), (y), (z))
|
||||
#else
|
||||
#define SHA1_F0(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
|
||||
#define SHA1_F1(x,y,z) ((x) ^ (y) ^ (z))
|
||||
#define SHA1_F2(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
|
||||
#endif
|
||||
#define SHA1_F0o(x,y,z) (SHA1_F0 ((x), (y), (z)))
|
||||
#define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z)))
|
||||
#endif
|
||||
@ -196,13 +186,8 @@
|
||||
#define SHA256_S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u))
|
||||
|
||||
#ifdef IS_NV
|
||||
#if CUDA_ARCH >= 500
|
||||
#define SHA256_F0(x,y,z) lut3_e8 ((x), (y), (z))
|
||||
#define SHA256_F1(x,y,z) lut3_ca ((x), (y), (z))
|
||||
#else
|
||||
#define SHA256_F0(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
|
||||
#define SHA256_F1(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
|
||||
#endif
|
||||
#define SHA256_F0o(x,y,z) (SHA256_F0 ((x), (y), (z)))
|
||||
#define SHA256_F1o(x,y,z) (SHA256_F1 ((x), (y), (z)))
|
||||
#endif
|
||||
@ -321,19 +306,11 @@
|
||||
#ifdef _RIPEMD160_
|
||||
|
||||
#ifdef IS_NV
|
||||
#if CUDA_ARCH >= 500
|
||||
#define RIPEMD160_F(x,y,z) lut3_96 ((x), (y), (z))
|
||||
#define RIPEMD160_G(x,y,z) lut3_ca ((x), (y), (z))
|
||||
#define RIPEMD160_H(x,y,z) lut3_59 ((x), (y), (z))
|
||||
#define RIPEMD160_I(x,y,z) lut3_e4 ((x), (y), (z))
|
||||
#define RIPEMD160_J(x,y,z) lut3_2d ((x), (y), (z))
|
||||
#else
|
||||
#define RIPEMD160_F(x,y,z) ((x) ^ (y) ^ (z))
|
||||
#define RIPEMD160_G(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) /* x ? y : z */
|
||||
#define RIPEMD160_H(x,y,z) (((x) | ~(y)) ^ (z))
|
||||
#define RIPEMD160_I(x,y,z) ((y) ^ ((z) & ((x) ^ (y)))) /* z ? x : y */
|
||||
#define RIPEMD160_J(x,y,z) ((x) ^ ((y) | ~(z)))
|
||||
#endif
|
||||
#define RIPEMD160_Go(x,y,z) (RIPEMD160_G ((x), (y), (z)))
|
||||
#define RIPEMD160_Io(x,y,z) (RIPEMD160_I ((x), (y), (z)))
|
||||
#endif
|
||||
|
@ -1121,10 +1121,11 @@ extern hc_thread_mutex_t mux_display;
|
||||
#define OPTI_TYPE_SINGLE_SALT (1 << 12)
|
||||
#define OPTI_TYPE_BRUTE_FORCE (1 << 13)
|
||||
#define OPTI_TYPE_RAW_HASH (1 << 14)
|
||||
#define OPTI_TYPE_USES_BITS_8 (1 << 15)
|
||||
#define OPTI_TYPE_USES_BITS_16 (1 << 16)
|
||||
#define OPTI_TYPE_USES_BITS_32 (1 << 17)
|
||||
#define OPTI_TYPE_USES_BITS_64 (1 << 18)
|
||||
#define OPTI_TYPE_SLOW_HASH_SIMD (1 << 15)
|
||||
#define OPTI_TYPE_USES_BITS_8 (1 << 16)
|
||||
#define OPTI_TYPE_USES_BITS_16 (1 << 17)
|
||||
#define OPTI_TYPE_USES_BITS_32 (1 << 18)
|
||||
#define OPTI_TYPE_USES_BITS_64 (1 << 19)
|
||||
|
||||
#define OPTI_STR_ZERO_BYTE "Zero-Byte"
|
||||
#define OPTI_STR_PRECOMPUTE_INIT "Precompute-Init"
|
||||
@ -1140,6 +1141,7 @@ extern hc_thread_mutex_t mux_display;
|
||||
#define OPTI_STR_SINGLE_SALT "Single-Salt"
|
||||
#define OPTI_STR_BRUTE_FORCE "Brute-Force"
|
||||
#define OPTI_STR_RAW_HASH "Raw-Hash"
|
||||
#define OPTI_STR_SLOW_HASH_SIMD "Slow-Hash-SIMD"
|
||||
#define OPTI_STR_USES_BITS_8 "Uses-8-Bit"
|
||||
#define OPTI_STR_USES_BITS_16 "Uses-16-Bit"
|
||||
#define OPTI_STR_USES_BITS_32 "Uses-32-Bit"
|
||||
|
@ -2424,8 +2424,18 @@ static void run_kernel (const uint kern_run, hc_device_param_t *device_param, co
|
||||
|
||||
hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
|
||||
|
||||
if (kern_run == KERN_RUN_2)
|
||||
{
|
||||
if (data.opti_type & OPTI_TYPE_SLOW_HASH_SIMD)
|
||||
{
|
||||
num_elements = CEIL ((float) num_elements / device_param->vector_width);
|
||||
}
|
||||
}
|
||||
|
||||
if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
|
||||
|
||||
while (num_elements % kernel_threads) num_elements++;
|
||||
|
||||
const size_t global_work_size[3] = { num_elements, 1, 1 };
|
||||
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
|
||||
|
||||
@ -7611,7 +7621,8 @@ int main (int argc, char **argv)
|
||||
dgst_size = DGST_SIZE_4_4;
|
||||
parse_func = phpass_parse_hash;
|
||||
sort_by_digest = sort_by_digest_4_4;
|
||||
opti_type = OPTI_TYPE_ZERO_BYTE;
|
||||
opti_type = OPTI_TYPE_ZERO_BYTE
|
||||
| OPTI_TYPE_SLOW_HASH_SIMD;
|
||||
dgst_pos0 = 0;
|
||||
dgst_pos1 = 1;
|
||||
dgst_pos2 = 2;
|
||||
|
@ -5632,6 +5632,7 @@ char *stroptitype (const uint opti_type)
|
||||
case OPTI_TYPE_SINGLE_SALT: return ((char *) OPTI_STR_SINGLE_SALT); break;
|
||||
case OPTI_TYPE_BRUTE_FORCE: return ((char *) OPTI_STR_BRUTE_FORCE); break;
|
||||
case OPTI_TYPE_RAW_HASH: return ((char *) OPTI_STR_RAW_HASH); break;
|
||||
case OPTI_TYPE_SLOW_HASH_SIMD: return ((char *) OPTI_STR_SLOW_HASH_SIMD); break;
|
||||
case OPTI_TYPE_USES_BITS_8: return ((char *) OPTI_STR_USES_BITS_8); break;
|
||||
case OPTI_TYPE_USES_BITS_16: return ((char *) OPTI_STR_USES_BITS_16); break;
|
||||
case OPTI_TYPE_USES_BITS_32: return ((char *) OPTI_STR_USES_BITS_32); break;
|
||||
|
Loading…
Reference in New Issue
Block a user