/*
 * Source: hashcat/OpenCL/m00400-pure.cl (190 lines, 3.6 KiB)
 */
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_hash_md5.cl)
#endif
/*
 * Paths of the comparison fragments, stringified through M2S so they can be
 * used as the operand of an #include directive.
 */
#define COMPARE_S M2S(INCLUDE_PATH/inc_comp_single.cl)
#define COMPARE_M M2S(INCLUDE_PATH/inc_comp_multi.cl)

/* Per-candidate scratch state handed from one kernel of this file to the next. */
typedef struct phpass_tmp
{
/* Four 32-bit MD5 state words: written by _init, updated by _loop, read by _comp. */
u32 digest_buf[4];
} phpass_tmp_t;
/**
 * Init kernel: hashes the salt followed by the password candidate with MD5
 * and stores the four resulting state words in tmps[gid].digest_buf, where
 * m00400_loop picks them up.
 */
KERNEL_FQ void m00400_init (KERN_ATTR_TMPS (phpass_tmp_t))
{
  /**
   * base
   */

  const u64 gid = get_global_id (0);

  // work-items past the number of candidates have nothing to do
  if (gid >= GID_CNT) return;

  /**
   * init
   */

  md5_ctx_t md5_ctx;

  md5_init (&md5_ctx);

  // salt first, then the candidate password
  md5_update_global (&md5_ctx, salt_bufs[SALT_POS_HOST].salt_buf, salt_bufs[SALT_POS_HOST].salt_len);

  md5_update_global (&md5_ctx, pws[gid].i, pws[gid].pw_len);

  md5_final (&md5_ctx);

  u32 digest[4];

  digest[0] = md5_ctx.h[0];
  digest[1] = md5_ctx.h[1];
  digest[2] = md5_ctx.h[2];
  digest[3] = md5_ctx.h[3];

  // hand the digest over to the loop kernel
  tmps[gid].digest_buf[0] = digest[0];
  tmps[gid].digest_buf[1] = digest[1];
  tmps[gid].digest_buf[2] = digest[2];
  tmps[gid].digest_buf[3] = digest[3];
}
/**
 * Loop kernel: repeatedly replaces the stored digest with
 * MD5 (digest || password).
 *
 * One round is always executed up front; the loops below then run for
 * i = 1 .. LOOP_CNT - 1. The result is written back to tmps[gid].digest_buf.
 */
KERNEL_FQ void m00400_loop (KERN_ATTR_TMPS (phpass_tmp_t))
{
  /**
   * base
   */

  const u64 gid = get_global_id (0);

  if (gid >= GID_CNT) return;

  /**
   * init
   */

  const u32 pw_len = pws[gid].pw_len;

  u32 w[64] = { 0 };

  // copy only the 32-bit words covered by pw_len; the rest of w stays zero
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }

  u32 digest[4];

  digest[0] = tmps[gid].digest_buf[0];
  digest[1] = tmps[gid].digest_buf[1];
  digest[2] = tmps[gid].digest_buf[2];
  digest[3] = tmps[gid].digest_buf[3];

  /**
   * loop
   */

  md5_ctx_t md5_ctx;

  // first round: seed the context with the 16 digest bytes, then append the password
  md5_init (&md5_ctx);

  md5_ctx.w0[0] = digest[0];
  md5_ctx.w0[1] = digest[1];
  md5_ctx.w0[2] = digest[2];
  md5_ctx.w0[3] = digest[3];

  md5_ctx.len = 16;

  md5_update (&md5_ctx, w, pw_len);

  md5_final (&md5_ctx);

  digest[0] = md5_ctx.h[0];
  digest[1] = md5_ctx.h[1];
  digest[2] = md5_ctx.h[2];
  digest[3] = md5_ctx.h[3];

  if ((16 + pw_len + 1) >= 56)
  {
    // long input: rebuild the full context on every round
    for (u32 i = 1; i < LOOP_CNT; i++)
    {
      md5_init (&md5_ctx);

      md5_ctx.w0[0] = digest[0];
      md5_ctx.w0[1] = digest[1];
      md5_ctx.w0[2] = digest[2];
      md5_ctx.w0[3] = digest[3];

      md5_ctx.len = 16;

      md5_update (&md5_ctx, w, pw_len);

      md5_final (&md5_ctx);

      digest[0] = md5_ctx.h[0];
      digest[1] = md5_ctx.h[1];
      digest[2] = md5_ctx.h[2];
      digest[3] = md5_ctx.h[3];
    }
  }
  else
  {
    // short input: only the leading digest words change between rounds, so
    // a single md5_transform per round is issued on the words already held
    // in md5_ctx.
    // NOTE(review): this relies on md5_final above leaving the padded
    // single-block message in md5_ctx.w0..w3 - confirm in inc_hash_md5.cl.
    for (u32 i = 1; i < LOOP_CNT; i++)
    {
      md5_ctx.w0[0] = digest[0];
      md5_ctx.w0[1] = digest[1];
      md5_ctx.w0[2] = digest[2];
      md5_ctx.w0[3] = digest[3];

      // reset the running state to the MD5M_* constants before transforming
      digest[0] = MD5M_A;
      digest[1] = MD5M_B;
      digest[2] = MD5M_C;
      digest[3] = MD5M_D;

      md5_transform (md5_ctx.w0, md5_ctx.w1, md5_ctx.w2, md5_ctx.w3, digest);
    }
  }

  tmps[gid].digest_buf[0] = digest[0];
  tmps[gid].digest_buf[1] = digest[1];
  tmps[gid].digest_buf[2] = digest[2];
  tmps[gid].digest_buf[3] = digest[3];
}
/**
 * Compare kernel: loads the final digest words for this work-item and hands
 * them to the comparison fragment included via COMPARE_M.
 *
 * NOTE(review): lid, r0..r3 and il_pos are not referenced in this file;
 * presumably the included fragment uses them - confirm in inc_comp_multi.cl.
 */
KERNEL_FQ void m00400_comp (KERN_ATTR_TMPS (phpass_tmp_t))
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);

  if (gid >= GID_CNT) return;

  /**
   * digest
   */

  // DGST_R0..DGST_R3 select which of the four stored words go into r0..r3
  const u32 r0 = tmps[gid].digest_buf[DGST_R0];
  const u32 r1 = tmps[gid].digest_buf[DGST_R1];
  const u32 r2 = tmps[gid].digest_buf[DGST_R2];
  const u32 r3 = tmps[gid].digest_buf[DGST_R3];

  #define il_pos 0

  #ifdef KERNEL_STATIC
  #include COMPARE_M
  #endif
}