Added SIMD code for WPA/WPA2

pull/332/head^2
Jens Steube 8 years ago
parent c78d7568e6
commit 9d74f2958d

@ -6186,6 +6186,536 @@ inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
}
}
inline void append_0x01_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
switch (offset)
{
case 0:
w0[0] = 0x01;
break;
case 1:
w0[0] = w0[0] | 0x0100;
break;
case 2:
w0[0] = w0[0] | 0x010000;
break;
case 3:
w0[0] = w0[0] | 0x01000000;
break;
case 4:
w0[1] = 0x01;
break;
case 5:
w0[1] = w0[1] | 0x0100;
break;
case 6:
w0[1] = w0[1] | 0x010000;
break;
case 7:
w0[1] = w0[1] | 0x01000000;
break;
case 8:
w0[2] = 0x01;
break;
case 9:
w0[2] = w0[2] | 0x0100;
break;
case 10:
w0[2] = w0[2] | 0x010000;
break;
case 11:
w0[2] = w0[2] | 0x01000000;
break;
case 12:
w0[3] = 0x01;
break;
case 13:
w0[3] = w0[3] | 0x0100;
break;
case 14:
w0[3] = w0[3] | 0x010000;
break;
case 15:
w0[3] = w0[3] | 0x01000000;
break;
case 16:
w1[0] = 0x01;
break;
case 17:
w1[0] = w1[0] | 0x0100;
break;
case 18:
w1[0] = w1[0] | 0x010000;
break;
case 19:
w1[0] = w1[0] | 0x01000000;
break;
case 20:
w1[1] = 0x01;
break;
case 21:
w1[1] = w1[1] | 0x0100;
break;
case 22:
w1[1] = w1[1] | 0x010000;
break;
case 23:
w1[1] = w1[1] | 0x01000000;
break;
case 24:
w1[2] = 0x01;
break;
case 25:
w1[2] = w1[2] | 0x0100;
break;
case 26:
w1[2] = w1[2] | 0x010000;
break;
case 27:
w1[2] = w1[2] | 0x01000000;
break;
case 28:
w1[3] = 0x01;
break;
case 29:
w1[3] = w1[3] | 0x0100;
break;
case 30:
w1[3] = w1[3] | 0x010000;
break;
case 31:
w1[3] = w1[3] | 0x01000000;
break;
case 32:
w2[0] = 0x01;
break;
case 33:
w2[0] = w2[0] | 0x0100;
break;
case 34:
w2[0] = w2[0] | 0x010000;
break;
case 35:
w2[0] = w2[0] | 0x01000000;
break;
case 36:
w2[1] = 0x01;
break;
case 37:
w2[1] = w2[1] | 0x0100;
break;
case 38:
w2[1] = w2[1] | 0x010000;
break;
case 39:
w2[1] = w2[1] | 0x01000000;
break;
case 40:
w2[2] = 0x01;
break;
case 41:
w2[2] = w2[2] | 0x0100;
break;
case 42:
w2[2] = w2[2] | 0x010000;
break;
case 43:
w2[2] = w2[2] | 0x01000000;
break;
case 44:
w2[3] = 0x01;
break;
case 45:
w2[3] = w2[3] | 0x0100;
break;
case 46:
w2[3] = w2[3] | 0x010000;
break;
case 47:
w2[3] = w2[3] | 0x01000000;
break;
}
}
inline void append_0x02_2x4_S (u32 w0[4], u32 w1[4], const u32 offset)
{
switch (offset)
{
case 0:
w0[0] = 0x02;
break;
case 1:
w0[0] = w0[0] | 0x0200;
break;
case 2:
w0[0] = w0[0] | 0x020000;
break;
case 3:
w0[0] = w0[0] | 0x02000000;
break;
case 4:
w0[1] = 0x02;
break;
case 5:
w0[1] = w0[1] | 0x0200;
break;
case 6:
w0[1] = w0[1] | 0x020000;
break;
case 7:
w0[1] = w0[1] | 0x02000000;
break;
case 8:
w0[2] = 0x02;
break;
case 9:
w0[2] = w0[2] | 0x0200;
break;
case 10:
w0[2] = w0[2] | 0x020000;
break;
case 11:
w0[2] = w0[2] | 0x02000000;
break;
case 12:
w0[3] = 0x02;
break;
case 13:
w0[3] = w0[3] | 0x0200;
break;
case 14:
w0[3] = w0[3] | 0x020000;
break;
case 15:
w0[3] = w0[3] | 0x02000000;
break;
case 16:
w1[0] = 0x02;
break;
case 17:
w1[0] = w1[0] | 0x0200;
break;
case 18:
w1[0] = w1[0] | 0x020000;
break;
case 19:
w1[0] = w1[0] | 0x02000000;
break;
case 20:
w1[1] = 0x02;
break;
case 21:
w1[1] = w1[1] | 0x0200;
break;
case 22:
w1[1] = w1[1] | 0x020000;
break;
case 23:
w1[1] = w1[1] | 0x02000000;
break;
case 24:
w1[2] = 0x02;
break;
case 25:
w1[2] = w1[2] | 0x0200;
break;
case 26:
w1[2] = w1[2] | 0x020000;
break;
case 27:
w1[2] = w1[2] | 0x02000000;
break;
case 28:
w1[3] = 0x02;
break;
case 29:
w1[3] = w1[3] | 0x0200;
break;
case 30:
w1[3] = w1[3] | 0x020000;
break;
case 31:
w1[3] = w1[3] | 0x02000000;
break;
}
}
inline void append_0x02_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset)
{
switch (offset)
{
case 0:
w0[0] = 0x02;
break;
case 1:
w0[0] = w0[0] | 0x0200;
break;
case 2:
w0[0] = w0[0] | 0x020000;
break;
case 3:
w0[0] = w0[0] | 0x02000000;
break;
case 4:
w0[1] = 0x02;
break;
case 5:
w0[1] = w0[1] | 0x0200;
break;
case 6:
w0[1] = w0[1] | 0x020000;
break;
case 7:
w0[1] = w0[1] | 0x02000000;
break;
case 8:
w0[2] = 0x02;
break;
case 9:
w0[2] = w0[2] | 0x0200;
break;
case 10:
w0[2] = w0[2] | 0x020000;
break;
case 11:
w0[2] = w0[2] | 0x02000000;
break;
case 12:
w0[3] = 0x02;
break;
case 13:
w0[3] = w0[3] | 0x0200;
break;
case 14:
w0[3] = w0[3] | 0x020000;
break;
case 15:
w0[3] = w0[3] | 0x02000000;
break;
case 16:
w1[0] = 0x02;
break;
case 17:
w1[0] = w1[0] | 0x0200;
break;
case 18:
w1[0] = w1[0] | 0x020000;
break;
case 19:
w1[0] = w1[0] | 0x02000000;
break;
case 20:
w1[1] = 0x02;
break;
case 21:
w1[1] = w1[1] | 0x0200;
break;
case 22:
w1[1] = w1[1] | 0x020000;
break;
case 23:
w1[1] = w1[1] | 0x02000000;
break;
case 24:
w1[2] = 0x02;
break;
case 25:
w1[2] = w1[2] | 0x0200;
break;
case 26:
w1[2] = w1[2] | 0x020000;
break;
case 27:
w1[2] = w1[2] | 0x02000000;
break;
case 28:
w1[3] = 0x02;
break;
case 29:
w1[3] = w1[3] | 0x0200;
break;
case 30:
w1[3] = w1[3] | 0x020000;
break;
case 31:
w1[3] = w1[3] | 0x02000000;
break;
case 32:
w2[0] = 0x02;
break;
case 33:
w2[0] = w2[0] | 0x0200;
break;
case 34:
w2[0] = w2[0] | 0x020000;
break;
case 35:
w2[0] = w2[0] | 0x02000000;
break;
case 36:
w2[1] = 0x02;
break;
case 37:
w2[1] = w2[1] | 0x0200;
break;
case 38:
w2[1] = w2[1] | 0x020000;
break;
case 39:
w2[1] = w2[1] | 0x02000000;
break;
case 40:
w2[2] = 0x02;
break;
case 41:
w2[2] = w2[2] | 0x0200;
break;
case 42:
w2[2] = w2[2] | 0x020000;
break;
case 43:
w2[2] = w2[2] | 0x02000000;
break;
case 44:
w2[3] = 0x02;
break;
case 45:
w2[3] = w2[3] | 0x0200;
break;
case 46:
w2[3] = w2[3] | 0x020000;
break;
case 47:
w2[3] = w2[3] | 0x02000000;
break;
}
}
inline void append_0x80_1x4_S (u32 w0[4], const u32 offset)
{
switch (offset)

@ -448,6 +448,93 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf
digest[3].s3 = tmps[gidx + 3].digest_buf[3];
}
#endif
#if VECT_SIZE >= 8
if ((gidx + 4) < gid_max)
{
w0[0].s4 = pws[gidx + 4].i[0];
w0[1].s4 = pws[gidx + 4].i[1];
w0[2].s4 = pws[gidx + 4].i[2];
w0[3].s4 = pws[gidx + 4].i[3];
w1[0].s4 = pws[gidx + 4].i[4];
w1[1].s4 = pws[gidx + 4].i[5];
w1[2].s4 = pws[gidx + 4].i[6];
w1[3].s4 = pws[gidx + 4].i[7];
w2[0].s4 = pws[gidx + 4].i[8];
w2[1].s4 = pws[gidx + 4].i[9];
pw_len.s4 = pws[gidx + 4].pw_len;
digest[0].s4 = tmps[gidx + 4].digest_buf[0];
digest[1].s4 = tmps[gidx + 4].digest_buf[1];
digest[2].s4 = tmps[gidx + 4].digest_buf[2];
digest[3].s4 = tmps[gidx + 4].digest_buf[3];
}
if ((gidx + 5) < gid_max)
{
w0[0].s5 = pws[gidx + 5].i[0];
w0[1].s5 = pws[gidx + 5].i[1];
w0[2].s5 = pws[gidx + 5].i[2];
w0[3].s5 = pws[gidx + 5].i[3];
w1[0].s5 = pws[gidx + 5].i[4];
w1[1].s5 = pws[gidx + 5].i[5];
w1[2].s5 = pws[gidx + 5].i[6];
w1[3].s5 = pws[gidx + 5].i[7];
w2[0].s5 = pws[gidx + 5].i[8];
w2[1].s5 = pws[gidx + 5].i[9];
pw_len.s5 = pws[gidx + 5].pw_len;
digest[0].s5 = tmps[gidx + 5].digest_buf[0];
digest[1].s5 = tmps[gidx + 5].digest_buf[1];
digest[2].s5 = tmps[gidx + 5].digest_buf[2];
digest[3].s5 = tmps[gidx + 5].digest_buf[3];
}
if ((gidx + 6) < gid_max)
{
w0[0].s6 = pws[gidx + 6].i[0];
w0[1].s6 = pws[gidx + 6].i[1];
w0[2].s6 = pws[gidx + 6].i[2];
w0[3].s6 = pws[gidx + 6].i[3];
w1[0].s6 = pws[gidx + 6].i[4];
w1[1].s6 = pws[gidx + 6].i[5];
w1[2].s6 = pws[gidx + 6].i[6];
w1[3].s6 = pws[gidx + 6].i[7];
w2[0].s6 = pws[gidx + 6].i[8];
w2[1].s6 = pws[gidx + 6].i[9];
pw_len.s6 = pws[gidx + 6].pw_len;
digest[0].s6 = tmps[gidx + 6].digest_buf[0];
digest[1].s6 = tmps[gidx + 6].digest_buf[1];
digest[2].s6 = tmps[gidx + 6].digest_buf[2];
digest[3].s6 = tmps[gidx + 6].digest_buf[3];
}
if ((gidx + 7) < gid_max)
{
w0[0].s7 = pws[gidx + 7].i[0];
w0[1].s7 = pws[gidx + 7].i[1];
w0[2].s7 = pws[gidx + 7].i[2];
w0[3].s7 = pws[gidx + 7].i[3];
w1[0].s7 = pws[gidx + 7].i[4];
w1[1].s7 = pws[gidx + 7].i[5];
w1[2].s7 = pws[gidx + 7].i[6];
w1[3].s7 = pws[gidx + 7].i[7];
w2[0].s7 = pws[gidx + 7].i[8];
w2[1].s7 = pws[gidx + 7].i[9];
pw_len.s7 = pws[gidx + 7].pw_len;
digest[0].s7 = tmps[gidx + 7].digest_buf[0];
digest[1].s7 = tmps[gidx + 7].digest_buf[1];
digest[2].s7 = tmps[gidx + 7].digest_buf[2];
digest[3].s7 = tmps[gidx + 7].digest_buf[3];
}
#endif
#endif
@ -547,6 +634,42 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf
tmps[gidx + 3].digest_buf[3] = digest[3].s3;
}
#endif
#if VECT_SIZE >= 8
if ((gidx + 4) < gid_max)
{
tmps[gidx + 4].digest_buf[0] = digest[0].s4;
tmps[gidx + 4].digest_buf[1] = digest[1].s4;
tmps[gidx + 4].digest_buf[2] = digest[2].s4;
tmps[gidx + 4].digest_buf[3] = digest[3].s4;
}
if ((gidx + 5) < gid_max)
{
tmps[gidx + 5].digest_buf[0] = digest[0].s5;
tmps[gidx + 5].digest_buf[1] = digest[1].s5;
tmps[gidx + 5].digest_buf[2] = digest[2].s5;
tmps[gidx + 5].digest_buf[3] = digest[3].s5;
}
if ((gidx + 6) < gid_max)
{
tmps[gidx + 6].digest_buf[0] = digest[0].s6;
tmps[gidx + 6].digest_buf[1] = digest[1].s6;
tmps[gidx + 6].digest_buf[2] = digest[2].s6;
tmps[gidx + 6].digest_buf[3] = digest[3].s6;
}
if ((gidx + 7) < gid_max)
{
tmps[gidx + 7].digest_buf[0] = digest[0].s7;
tmps[gidx + 7].digest_buf[1] = digest[1].s7;
tmps[gidx + 7].digest_buf[2] = digest[2].s7;
tmps[gidx + 7].digest_buf[3] = digest[3].s7;
}
#endif
#endif
}

File diff suppressed because it is too large Load Diff

@ -137,6 +137,15 @@
#define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z)))
#endif
#define SHA1_STEP_S(f,a,b,c,d,e,x) \
{ \
e += K; \
e += x; \
e += f (b, c, d); \
e += rotl32_S (a, 5u); \
b = rotl32_S (b, 30u); \
}
#define SHA1_STEP(f,a,b,c,d,e,x) \
{ \
e += K; \

@ -29,8 +29,25 @@
#if VENDOR_ID == (1 << 0)
#define IS_AMD
#elif VENDOR_ID == (1 << 6)
//#define IS_GENERIC
#elif VENDOR_ID == (1 << 1)
#define IS_APPLE
#define IS_GENERIC
#elif VENDOR_ID == (1 << 2)
#define IS_INTEL_BEIGNET
#define IS_GENERIC
#elif VENDOR_ID == (1 << 3)
#define IS_INTEL_SDK
#define IS_GENERIC
#elif VENDOR_ID == (1 << 4)
#define IS_MESA
#define IS_GENERIC
#elif VENDOR_ID == (1 << 5)
#define IS_NV
//#define IS_GENERIC
#elif VENDOR_ID == (1 << 6)
#define IS_POCL
#define IS_GENERIC
#else
#define IS_GENERIC
#endif

@ -135,7 +135,7 @@ static inline int CPU_ISSET (int num, cpu_set_t *cs) { return (cs->count & (1 <
#define CL_VENDOR_AMD "Advanced Micro Devices, Inc."
#define CL_VENDOR_APPLE "Apple"
#define CL_VENDOR_INTEL_BEIGNET "Intel"
#define CL_VENDOR_INTEL_SDK "Intel(R) OpenCL"
#define CL_VENDOR_INTEL_SDK "Intel(R) Corporation"
#define CL_VENDOR_MESA "Mesa"
#define CL_VENDOR_NV "NVIDIA Corporation"
#define CL_VENDOR_POCL "The pocl project"
@ -144,10 +144,9 @@ static inline int CPU_ISSET (int num, cpu_set_t *cs) { return (cs->count & (1 <
#define VENDOR_ID_APPLE (1 << 1)
#define VENDOR_ID_INTEL_BEIGNET (1 << 2)
#define VENDOR_ID_INTEL_SDK (1 << 3)
#define VENDOR_ID INTEL (1 << 4)
#define VENDOR_ID_MESA (1 << 5)
#define VENDOR_ID_NV (1 << 6)
#define VENDOR_ID_POCL (1 << 7)
#define VENDOR_ID_MESA (1 << 4)
#define VENDOR_ID_NV (1 << 5)
#define VENDOR_ID_POCL (1 << 6)
#define VENDOR_ID_GENERIC (1 << 31)
#define BLOCK_SIZE 64

@ -8167,7 +8167,8 @@ int main (int argc, char **argv)
dgst_size = DGST_SIZE_4_4;
parse_func = wpa_parse_hash;
sort_by_digest = sort_by_digest_4_4;
opti_type = OPTI_TYPE_ZERO_BYTE;
opti_type = OPTI_TYPE_ZERO_BYTE
| OPTI_TYPE_SLOW_HASH_SIMD;
dgst_pos0 = 0;
dgst_pos1 = 1;
dgst_pos2 = 2;
@ -12684,19 +12685,19 @@ int main (int argc, char **argv)
}
else if (strcmp (platform_vendor, CL_VENDOR_APPLE) == 0)
{
vendor_id = VENDOR_ID_GENERIC;
vendor_id = VENDOR_ID_APPLE;
}
else if (strcmp (platform_vendor, CL_VENDOR_INTEL_BEIGNET) == 0)
{
vendor_id = VENDOR_ID_GENERIC;
vendor_id = VENDOR_ID_INTEL_BEIGNET;
}
else if (strcmp (platform_vendor, CL_VENDOR_INTEL_SDK) == 0)
{
vendor_id = VENDOR_ID_GENERIC;
vendor_id = VENDOR_ID_INTEL_SDK;
}
else if (strcmp (platform_vendor, CL_VENDOR_MESA) == 0)
{
vendor_id = VENDOR_ID_GENERIC;
vendor_id = VENDOR_ID_MESA;
}
else if (strcmp (platform_vendor, CL_VENDOR_NV) == 0)
{
@ -12704,7 +12705,7 @@ int main (int argc, char **argv)
}
else if (strcmp (platform_vendor, CL_VENDOR_POCL) == 0)
{
vendor_id = VENDOR_ID_GENERIC;
vendor_id = VENDOR_ID_POCL;
}
else
{
@ -14007,6 +14008,21 @@ int main (int argc, char **argv)
snprintf (build_opts, sizeof (build_opts) - 1, "-cl-std=CL1.1 -I\"%s/\" -DVENDOR_ID=%u -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u -DKERN_TYPE=%u -D_unroll", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type, kern_type);
if (device_param->vendor_id == VENDOR_ID_INTEL_SDK)
{
// we do vectorizing much better than the auto-vectorizer
char build_opts_new[1024] = { 0 };
snprintf (build_opts_new, sizeof (build_opts_new) - 1, "%s -cl-opt-disable", build_opts);
strncpy (build_opts, build_opts_new, sizeof (build_opts) - 1);
}
#ifdef DEBUG
log_info ("Device #%u: build_opts '%s'\n", device_id + 1, build_opts);
#endif
/**
* main kernel
*/

Loading…
Cancel
Save