From 9b3d18f87d2bf9d537240f0a664eabf111aad4f3 Mon Sep 17 00:00:00 2001
From: jsteube
Date: Sun, 1 May 2016 18:34:59 +0200
Subject: [PATCH] SIMD for slow hashes prototype

---
 OpenCL/m00400.cl           | 310 ++++++++++++++++++++++++++++-----
 OpenCL/types_ocl.c         | 340 -------------------------------------
 hashcat_tuning.hctab       |   1 +
 include/common.h           |   2 +
 include/kernel_functions.c |  41 +----
 include/shared.h           |  10 +-
 src/oclHashcat.c           |  13 +-
 src/shared.c               |   1 +
 8 files changed, 301 insertions(+), 417 deletions(-)

diff --git a/OpenCL/m00400.cl b/OpenCL/m00400.cl
index c03f9612d..1eb365d10 100644
--- a/OpenCL/m00400.cl
+++ b/OpenCL/m00400.cl
@@ -5,6 +5,8 @@
 
 #define _MD5_
 
+#define NEW_SIMD_CODE
+
 #include "include/constants.h"
 #include "include/kernel_vendor.h"
 
@@ -16,11 +18,12 @@
 #include "include/kernel_functions.c"
 #include "OpenCL/types_ocl.c"
 #include "OpenCL/common.c"
+#include "OpenCL/simd.c"
 
 #define COMPARE_S "OpenCL/check_single_comp4.c"
 #define COMPARE_M "OpenCL/check_multi_comp4.c"
 
-static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
+static void md5_transform_S (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
 {
   u32 a = digest[0];
   u32 b = digest[1];
@@ -44,6 +47,104 @@ static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co
   u32 we_t = w3[2];
   u32 wf_t = 0;
 
+  MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
+  MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
+  MD5_STEP_S (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
+  MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
+  MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
+  MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
+  MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
+  MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
+  MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
+  MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
+  MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
+  MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
+  MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
+  MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
+  MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
+  MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
+
+  MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
+  MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
+  MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
+  MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
+  MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
+  MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
+  MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
+  MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
+  MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
+  MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
+  MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
+  MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
+  MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
+  MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
+  MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
+  MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
+
+  MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
+  MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
+  MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
+  MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
+  MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
+  MD5_STEP_S (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
+  MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
+  MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
+  MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
+  MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
+  MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
+  MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
+  MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
+  MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
+  MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
+  MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
+
+  MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
+  MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
+  MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
+  MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
+  MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
+  MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
+  MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
+  MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
+  MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
+  MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
+  MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
+  MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
+  MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
+  MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
+  MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
+  MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
+
+  digest[0] += a;
+  digest[1] += b;
+  digest[2] += c;
+  digest[3] += d;
+}
+
+static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
+{
+  u32x a = digest[0];
+  u32x b = digest[1];
+  u32x c = digest[2];
+  u32x d = digest[3];
+
+  u32x w0_t = w0[0];
+  u32x w1_t = w0[1];
+  u32x w2_t = w0[2];
+  u32x w3_t = w0[3];
+  u32x w4_t = w1[0];
+  u32x w5_t = w1[1];
+  u32x w6_t = w1[2];
+  u32x w7_t = w1[3];
+  u32x w8_t = w2[0];
+  u32x w9_t = w2[1];
+  u32x wa_t = w2[2];
+  u32x wb_t = w2[3];
+  u32x wc_t = w3[0];
+  u32x wd_t = w3[1];
+  u32x we_t = w3[2];
+  u32x wf_t = 0;
+
   MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
   MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
   MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
@@ -194,7 +295,7 @@ __kernel void m00400_init (__global pw_t *pws, __global kernel_rule_t *rules_buf
   block3[2] = block_len * 8;
   block3[3] = 0;
 
-  append_0x80_4x4 (block0, block1, block2, block3, block_len);
+  append_0x80_4x4_S (block0, block1, block2, block3, block_len);
 
   /**
    * init
@@ -207,7 +308,7 @@ __kernel void m00400_init (__global pw_t *pws, __global kernel_rule_t *rules_buf
   digest[2] = MD5M_C;
   digest[3] = MD5M_D;
 
-  md5_transform (block0, block1, block2, block3, digest);
+  md5_transform_S (block0, block1, block2, block3, digest);
 
   tmps[gid].digest_buf[0] = digest[0];
   tmps[gid].digest_buf[1] = digest[1];
@@ -225,75 +326,160 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf
 
   if (gid >= gid_max) return;
 
-  u32 w0[4];
+  u32x w0[4] = { 0 };
+  u32x w1[4] = { 0 };
+  u32x w2[4] = { 0 };
 
-  w0[0] = pws[gid].i[ 0];
-  w0[1] = pws[gid].i[ 1];
-  w0[2] = pws[gid].i[ 2];
-  w0[3] = pws[gid].i[ 3];
+  u32x pw_len = 0;
 
-  u32 w1[4];
+  u32x digest[4] = { 0 };
 
-  w1[0] = pws[gid].i[ 4];
-  w1[1] = pws[gid].i[ 5];
-  w1[2] = pws[gid].i[ 6];
-  w1[3] = pws[gid].i[ 7];
+  #if VECT_SIZE == 1
 
-  u32 w2[4];
+  w0[0] = pws[gid].i[0];
+  w0[1] = pws[gid].i[1];
+  w0[2] = pws[gid].i[2];
+  w0[3] = pws[gid].i[3];
+  w1[0] = pws[gid].i[4];
+  w1[1] = pws[gid].i[5];
+  w1[2] = pws[gid].i[6];
+  w1[3] = pws[gid].i[7];
+  w2[0] = pws[gid].i[8];
+  w2[1] = pws[gid].i[9];
 
-  w2[0] = pws[gid].i[ 8];
-  w2[1] = pws[gid].i[ 9];
-  w2[2] = 0;
-  w2[3] = 0;
-
-  const u32 pw_len = pws[gid].pw_len;
-
-  /**
-   * digest
-   */
-
-  u32 digest[4];
+  pw_len = pws[gid].pw_len;
 
   digest[0] = tmps[gid].digest_buf[0];
   digest[1] = tmps[gid].digest_buf[1];
   digest[2] = tmps[gid].digest_buf[2];
   digest[3] = tmps[gid].digest_buf[3];
 
+  #else
+
+  const u32 gidx = gid * VECT_SIZE;
+
+  #if VECT_SIZE >= 2
+
+  if ((gidx + 0) < gid_max)
+  {
+    w0[0].s0 = pws[gidx + 0].i[0];
+    w0[1].s0 = pws[gidx + 0].i[1];
+    w0[2].s0 = pws[gidx + 0].i[2];
+    w0[3].s0 = pws[gidx + 0].i[3];
+    w1[0].s0 = pws[gidx + 0].i[4];
+    w1[1].s0 = pws[gidx + 0].i[5];
+    w1[2].s0 = pws[gidx + 0].i[6];
+    w1[3].s0 = pws[gidx + 0].i[7];
+    w2[0].s0 = pws[gidx + 0].i[8];
+    w2[1].s0 = pws[gidx + 0].i[9];
+
+    pw_len.s0 = pws[gidx + 0].pw_len;
+
+    digest[0].s0 = tmps[gidx + 0].digest_buf[0];
+    digest[1].s0 = tmps[gidx + 0].digest_buf[1];
+    digest[2].s0 = tmps[gidx + 0].digest_buf[2];
+    digest[3].s0 = tmps[gidx + 0].digest_buf[3];
+  }
+
+  if ((gidx + 1) < gid_max)
+  {
+    w0[0].s1 = pws[gidx + 1].i[0];
+    w0[1].s1 = pws[gidx + 1].i[1];
+    w0[2].s1 = pws[gidx + 1].i[2];
+    w0[3].s1 = pws[gidx + 1].i[3];
+    w1[0].s1 = pws[gidx + 1].i[4];
+    w1[1].s1 = pws[gidx + 1].i[5];
+    w1[2].s1 = pws[gidx + 1].i[6];
+    w1[3].s1 = pws[gidx + 1].i[7];
+    w2[0].s1 = pws[gidx + 1].i[8];
+    w2[1].s1 = pws[gidx + 1].i[9];
+
+    pw_len.s1 = pws[gidx + 1].pw_len;
+
+    digest[0].s1 = tmps[gidx + 1].digest_buf[0];
+    digest[1].s1 = tmps[gidx + 1].digest_buf[1];
+    digest[2].s1 = tmps[gidx + 1].digest_buf[2];
+    digest[3].s1 = tmps[gidx + 1].digest_buf[3];
+  }
+
+  #endif
+
+  #if VECT_SIZE >= 4
+
+  if ((gidx + 2) < gid_max)
+  {
+    w0[0].s2 = pws[gidx + 2].i[0];
+    w0[1].s2 = pws[gidx + 2].i[1];
+    w0[2].s2 = pws[gidx + 2].i[2];
+    w0[3].s2 = pws[gidx + 2].i[3];
+    w1[0].s2 = pws[gidx + 2].i[4];
+    w1[1].s2 = pws[gidx + 2].i[5];
+    w1[2].s2 = pws[gidx + 2].i[6];
+    w1[3].s2 = pws[gidx + 2].i[7];
+    w2[0].s2 = pws[gidx + 2].i[8];
+    w2[1].s2 = pws[gidx + 2].i[9];
+
+    pw_len.s2 = pws[gidx + 2].pw_len;
+
+    digest[0].s2 = tmps[gidx + 2].digest_buf[0];
+    digest[1].s2 = tmps[gidx + 2].digest_buf[1];
+    digest[2].s2 = tmps[gidx + 2].digest_buf[2];
+    digest[3].s2 = tmps[gidx + 2].digest_buf[3];
+  }
+
+  if ((gidx + 3) < gid_max)
+  {
+    w0[0].s3 = pws[gidx + 3].i[0];
+    w0[1].s3 = pws[gidx + 3].i[1];
+    w0[2].s3 = pws[gidx + 3].i[2];
+    w0[3].s3 = pws[gidx + 3].i[3];
+    w1[0].s3 = pws[gidx + 3].i[4];
+    w1[1].s3 = pws[gidx + 3].i[5];
+    w1[2].s3 = pws[gidx + 3].i[6];
+    w1[3].s3 = pws[gidx + 3].i[7];
+    w2[0].s3 = pws[gidx + 3].i[8];
+    w2[1].s3 = pws[gidx + 3].i[9];
+
+    pw_len.s3 = pws[gidx + 3].pw_len;
+
+    digest[0].s3 = tmps[gidx + 3].digest_buf[0];
+    digest[1].s3 = tmps[gidx + 3].digest_buf[1];
+    digest[2].s3 = tmps[gidx + 3].digest_buf[2];
+    digest[3].s3 = tmps[gidx + 3].digest_buf[3];
+  }
+
+  #endif
+  #endif
+
   /**
    * loop
    */
 
-  u32 block_len = (16 + pw_len);
+  u32x block_len = (16 + pw_len);
 
-  u32 block0[4];
+  u32x block0[4];
+  u32x block1[4];
+  u32x block2[4];
+  u32x block3[4];
 
   block0[0] = 0;
   block0[1] = 0;
   block0[2] = 0;
   block0[3] = 0;
-
-  u32 block1[4];
-
   block1[0] = w0[0];
   block1[1] = w0[1];
   block1[2] = w0[2];
   block1[3] = w0[3];
-
-  u32 block2[4];
-
   block2[0] = w1[0];
   block2[1] = w1[1];
   block2[2] = w1[2];
   block2[3] = w1[3];
-
-  u32 block3[4];
-
   block3[0] = w2[0];
   block3[1] = w2[1];
   block3[2] = block_len * 8;
   block3[3] = 0;
 
-  append_0x80_4x4 (block0, block1, block2, block3, block_len);
+  append_0x80_4x4_VV (block0, block1, block2, block3, block_len);
 
   /**
    * init
@@ -314,10 +500,55 @@ __kernel void m00400_loop (__global pw_t *pws, __global kernel_rule_t *rules_buf
     md5_transform (block0, block1, block2, block3, digest);
   }
 
+  #if VECT_SIZE == 1
+
   tmps[gid].digest_buf[0] = digest[0];
   tmps[gid].digest_buf[1] = digest[1];
   tmps[gid].digest_buf[2] = digest[2];
   tmps[gid].digest_buf[3] = digest[3];
+
+  #else
+
+  #if VECT_SIZE >= 2
+
+  if ((gidx + 0) < gid_max)
+  {
+    tmps[gidx + 0].digest_buf[0] = digest[0].s0;
+    tmps[gidx + 0].digest_buf[1] = digest[1].s0;
+    tmps[gidx + 0].digest_buf[2] = digest[2].s0;
+    tmps[gidx + 0].digest_buf[3] = digest[3].s0;
+  }
+
+  if ((gidx + 1) < gid_max)
+  {
+    tmps[gidx + 1].digest_buf[0] = digest[0].s1;
+    tmps[gidx + 1].digest_buf[1] = digest[1].s1;
+    tmps[gidx + 1].digest_buf[2] = digest[2].s1;
+    tmps[gidx + 1].digest_buf[3] = digest[3].s1;
+  }
+
+  #endif
+
+  #if VECT_SIZE >= 4
+
+  if ((gidx + 2) < gid_max)
+  {
+    tmps[gidx + 2].digest_buf[0] = digest[0].s2;
+    tmps[gidx + 2].digest_buf[1] = digest[1].s2;
+    tmps[gidx + 2].digest_buf[2] = digest[2].s2;
+    tmps[gidx + 2].digest_buf[3] = digest[3].s2;
+  }
+
+  if ((gidx + 3) < gid_max)
+  {
+    tmps[gidx + 3].digest_buf[0] = digest[0].s3;
+    tmps[gidx + 3].digest_buf[1] = digest[1].s3;
+    tmps[gidx + 3].digest_buf[2] = digest[2].s3;
+    tmps[gidx + 3].digest_buf[3] = digest[3].s3;
+  }
+
+  #endif
+  #endif
 }
 
 __kernel void m00400_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global phpass_tmp_t *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
@@ -327,11 +558,10 @@ __kernel void m00400_comp (__global pw_t *pws, __global kernel_rule_t *rules_buf
    */
 
   const u32 gid = get_global_id (0);
+  const u32 lid = get_local_id (0);
 
   if (gid >= gid_max) return;
 
-  const u32 lid = get_local_id (0);
-
   /**
    * digest
    */
diff --git a/OpenCL/types_ocl.c b/OpenCL/types_ocl.c
index b2e9b8618..68d40de76 100644
--- a/OpenCL/types_ocl.c
+++ b/OpenCL/types_ocl.c
@@ -351,71 +351,6 @@ static inline u64 rotl64_S (const u64 a, const u32 n)
   return rotr64_S (a, 64 - n);
 }
 
-#if CUDA_ARCH >= 500
-static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c)
-{
-  u32 r;
-
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
-  return r;
-}
-
-static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c)
-{
-  u32 r;
-
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
-  return r;
-}
-
-static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c)
-{
-  u32 r;
-
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
-  return r;
-}
-
-static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c)
-{
-  u32 r;
-
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
-  return r;
-}
-
-static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c)
-{
-  u32 r;
-
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
-  return r;
-}
-
-static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c)
-{
-  u32 r;
-
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
-  return r;
-}
-
-static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c)
-{
-  u32 r;
-
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-
-  return r;
-}
-#endif
-
 static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
 {
   u32 r;
@@ -850,281 +785,6 @@ static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c)
 }
 #endif
 
-#if CUDA_ARCH >= 500
-static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c)
-{
-  u32x r;
-
-  #if VECT_SIZE == 1
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-  #endif
-
-  #if VECT_SIZE >= 2
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
-  #endif
-
-  #if VECT_SIZE >= 4
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
-  #endif
-
-  #if VECT_SIZE >= 8
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
-  #endif
-
-  #if VECT_SIZE >= 16
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
-  #endif
-
-  return r;
-}
-
-static inline u32x lut3_39 (const u32x a, const u32x b, const u32x c)
-{
-  u32x r;
-
-  #if VECT_SIZE == 1
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-  #endif
-
-  #if VECT_SIZE >= 2
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
-  #endif
-
-  #if VECT_SIZE >= 4
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
-  #endif
-
-  #if VECT_SIZE >= 8
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
-  #endif
-
-  #if VECT_SIZE >= 16
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
-  #endif
-
-  return r;
-}
-
-static inline u32x lut3_59 (const u32x a, const u32x b, const u32x c)
-{
-  u32x r;
-
-  #if VECT_SIZE == 1
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-  #endif
-
-  #if VECT_SIZE >= 2
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
-  #endif
-
-  #if VECT_SIZE >= 4
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
-  #endif
-
-  #if VECT_SIZE >= 8
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
-  #endif
-
-  #if VECT_SIZE >= 16
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
-  #endif
-
-  return r;
-}
-
-static inline u32x lut3_96 (const u32x a, const u32x b, const u32x c)
-{
-  u32x r;
-
-  #if VECT_SIZE == 1
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-  #endif
-
-  #if VECT_SIZE >= 2
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
-  #endif
-
-  #if VECT_SIZE >= 4
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
-  #endif
-
-  #if VECT_SIZE >= 8
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
-  #endif
-
-  #if VECT_SIZE >= 16
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
-  asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
-  #endif
-
-  return r;
-}
-
-static inline u32x lut3_e4 (const u32x a, const u32x b, const u32x c)
-{
-  u32x r;
-
-  #if VECT_SIZE == 1
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-  #endif
-
-  #if VECT_SIZE >= 2
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
-  #endif
-
-  #if VECT_SIZE >= 4
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
-  #endif
-
-  #if VECT_SIZE >= 8
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
-  #endif
-
-  #if VECT_SIZE >= 16
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
-  #endif
-
-  return r;
-}
-
-static inline u32x lut3_e8 (const u32x a, const u32x b, const u32x c)
-{
-  u32x r;
-
-  #if VECT_SIZE == 1
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-  #endif
-
-  #if VECT_SIZE >= 2
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
-  #endif
-
-  #if VECT_SIZE >= 4
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
-  #endif
-
-  #if VECT_SIZE >= 8
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
-  #endif
-
-  #if VECT_SIZE >= 16
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
-  #endif
-
-  return r;
-}
-
-static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c)
-{
-  u32x r;
-
-  #if VECT_SIZE == 1
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c));
-  #endif
-
-  #if VECT_SIZE >= 2
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1));
-  #endif
-
-  #if VECT_SIZE >= 4
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3));
-  #endif
-
-  #if VECT_SIZE >= 8
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s7) : "r" (a.s7), "r" (b.s7), "r" (c.s7));
-  #endif
-
-  #if VECT_SIZE >= 16
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s8) : "r" (a.s8), "r" (b.s8), "r" (c.s8));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.s9) : "r" (a.s9), "r" (b.s9), "r" (c.s9));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sa) : "r" (a.sa), "r" (b.sa), "r" (c.sa));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sb) : "r" (a.sb), "r" (b.sb), "r" (c.sb));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sc) : "r" (a.sc), "r" (b.sc), "r" (c.sc));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sd) : "r" (a.sd), "r" (b.sd), "r" (c.sd));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.se) : "r" (a.se), "r" (b.se), "r" (c.se));
-  asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r.sf) : "r" (a.sf), "r" (b.sf), "r" (c.sf));
-  #endif
-
-  return r;
-}
-
-#endif
 #endif
 
 #ifdef IS_GENERIC
diff --git a/hashcat_tuning.hctab b/hashcat_tuning.hctab
index 87f7fd10a..43391da80 100644
--- a/hashcat_tuning.hctab
+++ b/hashcat_tuning.hctab
@@ -71,6 +71,7 @@ ALIAS_nv_budget * 21 2 A
 ALIAS_nv_budget * 22 2 A A
 ALIAS_nv_budget * 23 2 A A
 ALIAS_nv_budget * 200 2 A A
+ALIAS_nv_budget * 400 2 A A
 ALIAS_nv_budget * 900 2 A A
 ALIAS_nv_budget * 1000 2 A A
 ALIAS_nv_budget * 1100 2 A A
diff --git a/include/common.h b/include/common.h
index 278f63ad4..18e2f0b16 100644
--- a/include/common.h
+++ b/include/common.h
@@ -123,4 +123,6 @@ void log_error (const char *fmt, ...);
 #define MIN(a,b) (((a) < (b)) ? (a) : (b))
 #define MAX(a,b) (((a) > (b)) ? (a) : (b))
 
+#define CEIL(a) ((a - (int) (a)) > 0 ? a + 1 : a)
+
 #endif // COMMON_H
diff --git a/include/kernel_functions.c b/include/kernel_functions.c
index 1d1048f27..d3326fb5c 100644
--- a/include/kernel_functions.c
+++ b/include/kernel_functions.c
@@ -10,15 +10,9 @@
 #define MD4_H_S(x,y,z) ((x) ^ (y) ^ (z))
 
 #ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define MD4_F(x,y,z) lut3_ca ((x), (y), (z))
-#define MD4_G(x,y,z) lut3_e8 ((x), (y), (z))
-#define MD4_H(x,y,z) lut3_96 ((x), (y), (z))
-#else
 #define MD4_F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
 #define MD4_G(x,y,z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
 #define MD4_H(x,y,z) ((x) ^ (y) ^ (z))
-#endif
 #define MD4_Fo(x,y,z) (MD4_F((x), (y), (z)))
 #define MD4_Go(x,y,z) (MD4_G((x), (y), (z)))
 #endif
@@ -64,17 +58,10 @@
 #define MD5_I_S(x,y,z) ((y) ^ ((x) | ~(z)))
 
 #ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define MD5_F(x,y,z) lut3_ca ((x), (y), (z))
-#define MD5_G(x,y,z) lut3_e4 ((x), (y), (z))
-#define MD5_H(x,y,z) lut3_96 ((x), (y), (z))
-#define MD5_I(x,y,z) lut3_39 ((x), (y), (z))
-#else
 #define MD5_F(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define MD5_G(x,y,z) ((y) ^ ((z) & ((x) ^ (y))))
 #define MD5_H(x,y,z) ((x) ^ (y) ^ (z))
 #define MD5_I(x,y,z) ((y) ^ ((x) | ~(z)))
-#endif
 #define MD5_Fo(x,y,z) (MD5_F((x), (y), (z)))
 #define MD5_Go(x,y,z) (MD5_G((x), (y), (z)))
 #endif
@@ -97,6 +84,15 @@
 #define MD5_Go(x,y,z) (MD5_G((x), (y), (z)))
 #endif
 
+#define MD5_STEP_S(f,a,b,c,d,x,K,s) \
+{                                   \
+  a += K;                           \
+  a += x;                           \
+  a += f (b, c, d);                 \
+  a  = rotl32_S (a, s);             \
+  a += b;                           \
+}
+
 #define MD5_STEP(f,a,b,c,d,x,K,s) \
 {                                 \
   a += K;                         \
@@ -118,15 +114,9 @@
 #if defined _SHA1_ || defined _SAPG_ || defined _OFFICE2007_ || defined _OFFICE2010_ || defined _OLDOFFICE34_ || defined _ANDROIDFDE_ || defined _DCC2_ || defined _WPA_ || defined _MD5_SHA1_ || defined _SHA1_MD5_ || defined _PSAFE2_ || defined _LOTUS8_ || defined _PBKDF2_SHA1_ || defined _RAR3_ || defined _SHA256_SHA1_
 
 #ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define SHA1_F0(x,y,z) lut3_ca ((x), (y), (z))
-#define SHA1_F1(x,y,z) lut3_96 ((x), (y), (z))
-#define SHA1_F2(x,y,z) lut3_e8 ((x), (y), (z))
-#else
 #define SHA1_F0(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define SHA1_F1(x,y,z) ((x) ^ (y) ^ (z))
 #define SHA1_F2(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
-#endif
 #define SHA1_F0o(x,y,z) (SHA1_F0 ((x), (y), (z)))
 #define SHA1_F2o(x,y,z) (SHA1_F2 ((x), (y), (z)))
 #endif
@@ -196,13 +186,8 @@
 #define SHA256_S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u))
 
 #ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define SHA256_F0(x,y,z) lut3_e8 ((x), (y), (z))
-#define SHA256_F1(x,y,z) lut3_ca ((x), (y), (z))
-#else
 #define SHA256_F0(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y))))
 #define SHA256_F1(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
-#endif
 #define SHA256_F0o(x,y,z) (SHA256_F0 ((x), (y), (z)))
 #define SHA256_F1o(x,y,z) (SHA256_F1 ((x), (y), (z)))
 #endif
@@ -321,19 +306,11 @@
 #ifdef _RIPEMD160_
 
 #ifdef IS_NV
-#if CUDA_ARCH >= 500
-#define RIPEMD160_F(x,y,z) lut3_96 ((x), (y), (z))
-#define RIPEMD160_G(x,y,z) lut3_ca ((x), (y), (z))
-#define RIPEMD160_H(x,y,z) lut3_59 ((x), (y), (z))
-#define RIPEMD160_I(x,y,z) lut3_e4 ((x), (y), (z))
-#define RIPEMD160_J(x,y,z) lut3_2d ((x), (y), (z))
-#else
 #define RIPEMD160_F(x,y,z) ((x) ^ (y) ^ (z))
 #define RIPEMD160_G(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) /* x ? y : z */
 #define RIPEMD160_H(x,y,z) (((x) | ~(y)) ^ (z))
 #define RIPEMD160_I(x,y,z) ((y) ^ ((z) & ((x) ^ (y)))) /* z ? x : y */
 #define RIPEMD160_J(x,y,z) ((x) ^ ((y) | ~(z)))
-#endif
 #define RIPEMD160_Go(x,y,z) (RIPEMD160_G ((x), (y), (z)))
 #define RIPEMD160_Io(x,y,z) (RIPEMD160_I ((x), (y), (z)))
 #endif
diff --git a/include/shared.h b/include/shared.h
index 1038ba9eb..f3666b75a 100644
--- a/include/shared.h
+++ b/include/shared.h
@@ -1121,10 +1121,11 @@ extern hc_thread_mutex_t mux_display;
 #define OPTI_TYPE_SINGLE_SALT (1 << 12)
 #define OPTI_TYPE_BRUTE_FORCE (1 << 13)
 #define OPTI_TYPE_RAW_HASH (1 << 14)
-#define OPTI_TYPE_USES_BITS_8 (1 << 15)
-#define OPTI_TYPE_USES_BITS_16 (1 << 16)
-#define OPTI_TYPE_USES_BITS_32 (1 << 17)
-#define OPTI_TYPE_USES_BITS_64 (1 << 18)
+#define OPTI_TYPE_SLOW_HASH_SIMD (1 << 15)
+#define OPTI_TYPE_USES_BITS_8 (1 << 16)
+#define OPTI_TYPE_USES_BITS_16 (1 << 17)
+#define OPTI_TYPE_USES_BITS_32 (1 << 18)
+#define OPTI_TYPE_USES_BITS_64 (1 << 19)
 
 #define OPTI_STR_ZERO_BYTE "Zero-Byte"
 #define OPTI_STR_PRECOMPUTE_INIT "Precompute-Init"
@@ -1140,6 +1141,7 @@ extern hc_thread_mutex_t mux_display;
 #define OPTI_STR_SINGLE_SALT "Single-Salt"
 #define OPTI_STR_BRUTE_FORCE "Brute-Force"
 #define OPTI_STR_RAW_HASH "Raw-Hash"
+#define OPTI_STR_SLOW_HASH_SIMD "Slow-Hash-SIMD"
 #define OPTI_STR_USES_BITS_8 "Uses-8-Bit"
 #define OPTI_STR_USES_BITS_16 "Uses-16-Bit"
 #define OPTI_STR_USES_BITS_32 "Uses-32-Bit"
diff --git a/src/oclHashcat.c b/src/oclHashcat.c
index 4b1ec0845..913e514be 100644
--- a/src/oclHashcat.c
+++ b/src/oclHashcat.c
@@ -2424,8 +2424,18 @@ static void run_kernel (const uint kern_run, hc_device_param_t *device_param, co
 
   hc_clGetKernelWorkGroupInfo (data.ocl, kernel, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &workgroup_size, NULL);
 
+  if (kern_run == KERN_RUN_2)
+  {
+    if (data.opti_type & OPTI_TYPE_SLOW_HASH_SIMD)
+    {
+      num_elements = CEIL ((float) num_elements / device_param->vector_width);
+    }
+  }
+
   if (kernel_threads > workgroup_size) kernel_threads = workgroup_size;
 
+  while (num_elements % kernel_threads) num_elements++;
+
   const size_t global_work_size[3] = { num_elements, 1, 1 };
   const size_t local_work_size[3]  = { kernel_threads, 1, 1 };
@@ -7611,7 +7621,8 @@ int main (int argc, char **argv)
                  dgst_size   = DGST_SIZE_4_4;
                  parse_func  = phpass_parse_hash;
                  sort_by_digest = sort_by_digest_4_4;
-                 opti_type   = OPTI_TYPE_ZERO_BYTE;
+                 opti_type   = OPTI_TYPE_ZERO_BYTE
+                             | OPTI_TYPE_SLOW_HASH_SIMD;
                  dgst_pos0   = 0;
                  dgst_pos1   = 1;
                  dgst_pos2   = 2;
diff --git a/src/shared.c b/src/shared.c
index 3aca24369..fdea5919c 100644
--- a/src/shared.c
+++ b/src/shared.c
@@ -5632,6 +5632,7 @@ char *stroptitype (const uint opti_type)
     case OPTI_TYPE_SINGLE_SALT: return ((char *) OPTI_STR_SINGLE_SALT); break;
     case OPTI_TYPE_BRUTE_FORCE: return ((char *) OPTI_STR_BRUTE_FORCE); break;
     case OPTI_TYPE_RAW_HASH: return ((char *) OPTI_STR_RAW_HASH); break;
+    case OPTI_TYPE_SLOW_HASH_SIMD: return ((char *) OPTI_STR_SLOW_HASH_SIMD); break;
     case OPTI_TYPE_USES_BITS_8: return ((char *) OPTI_STR_USES_BITS_8); break;
     case OPTI_TYPE_USES_BITS_16: return ((char *) OPTI_STR_USES_BITS_16); break;
    case OPTI_TYPE_USES_BITS_32: return ((char *) OPTI_STR_USES_BITS_32); break;
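Note (illustration only, not part of the patch): the host-side change in run_kernel() is the part that makes the kernel-side scatter/gather above pay off. For a SIMD-enabled loop kernel each work-item now handles vector_width candidates, so the element count is divided by the vector width (rounded up via the CEIL macro added to include/common.h) and then padded to a multiple of the work-group size. The standalone C sketch below reproduces that arithmetic with made-up example values (1000 candidates, vector width 4, work-group size 64); the variable names mirror run_kernel() but the program itself is hypothetical.

/* Illustration only -- a minimal standalone sketch of the work-size
 * rounding performed for the SIMD loop kernel. All values are
 * hypothetical examples. */
#include <stdio.h>

#define CEIL(a) ((a - (int) (a)) > 0 ? a + 1 : a)

int main (void)
{
  size_t num_elements   = 1000; /* hypothetical number of candidates     */
  size_t vector_width   = 4;    /* hypothetical device VECT_SIZE         */
  size_t kernel_threads = 64;   /* hypothetical work-group size          */

  /* one work-item now processes vector_width candidates */
  num_elements = CEIL ((float) num_elements / vector_width); /* 1000/4 = 250 */

  /* pad the global size to a multiple of the local size */
  while (num_elements % kernel_threads) num_elements++;      /* 250 -> 256   */

  printf ("global_work_size = %zu\n", num_elements);

  return 0;
}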