2015-12-04 14:47:52 +00:00
|
|
|
/**
 * Author......: See docs/credits.txt
 * License.....: MIT
 */
|
|
|
|
|
2017-07-18 11:23:42 +00:00
|
|
|
//#define NEW_SIMD_CODE
|
2016-05-01 16:34:59 +00:00
|
|
|
|
2016-05-25 21:04:26 +00:00
|
|
|
#include "inc_vendor.cl"
|
2016-06-26 21:39:42 +00:00
|
|
|
#include "inc_hash_constants.h"
|
2016-05-25 21:04:26 +00:00
|
|
|
#include "inc_hash_functions.cl"
|
|
|
|
#include "inc_types.cl"
|
|
|
|
#include "inc_common.cl"
|
|
|
|
#include "inc_simd.cl"
|
2017-07-18 11:23:42 +00:00
|
|
|
#include "inc_hash_md5.cl"
|
2015-12-04 14:47:52 +00:00
|
|
|
|
2016-05-25 21:04:26 +00:00
|
|
|
#define COMPARE_S "inc_comp_single.cl"
|
|
|
|
#define COMPARE_M "inc_comp_multi.cl"
|
2015-12-04 14:47:52 +00:00
|
|
|
|
2017-08-25 15:52:55 +00:00
|
|
|
/**
 * m00400_init: seeds the per-candidate state for the iterated hash.
 *
 * For the work-item's password candidate, feeds the salt and then the
 * password into an MD5 context (via the md5_* helpers from inc_hash_md5.cl)
 * and stores the four resulting state words in tmps[gid].digest_buf.
 *
 * Reads : pws[gid], salt_bufs[salt_pos], gid_max
 * Writes: tmps[gid].digest_buf[0..3]
 *
 * The remaining parameters are not referenced in this kernel.
 */
__kernel void m00400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global phpass_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  /**
   * base
   */

  const u64 gid = get_global_id (0);

  // work-items at or beyond gid_max have nothing to process
  if (gid >= gid_max) return;

  /**
   * init
   */

  md5_ctx_t md5_ctx;

  md5_init (&md5_ctx);

  // salt first, then the password candidate, both read from global memory
  md5_update_global (&md5_ctx, salt_bufs[salt_pos].salt_buf, salt_bufs[salt_pos].salt_len);

  md5_update_global (&md5_ctx, pws[gid].i, pws[gid].pw_len);

  md5_final (&md5_ctx);

  u32 digest[4];

  digest[0] = md5_ctx.h[0];
  digest[1] = md5_ctx.h[1];
  digest[2] = md5_ctx.h[2];
  digest[3] = md5_ctx.h[3];

  // hand the starting digest over to the loop kernel through tmps
  tmps[gid].digest_buf[0] = digest[0];
  tmps[gid].digest_buf[1] = digest[1];
  tmps[gid].digest_buf[2] = digest[2];
  tmps[gid].digest_buf[3] = digest[3];
}
|
|
|
|
|
2017-08-25 15:52:55 +00:00
|
|
|
/**
 * m00400_loop: runs loop_cnt rounds of digest = MD5 (digest . password).
 *
 * The previous digest is loaded from tmps[gid].digest_buf, the password is
 * copied from pws[gid] into a private buffer once, and after the rounds the
 * resulting digest is written back to tmps[gid].digest_buf.
 *
 * The first round is always executed through the generic md5_update /
 * md5_final path; the remaining (loop_cnt - 1) rounds take one of two paths
 * depending on the password length (see below).
 *
 * Reads : pws[gid], tmps[gid].digest_buf, loop_cnt, gid_max
 * Writes: tmps[gid].digest_buf[0..3]
 *
 * History (from the commit message of the 256-byte password conversion):
 * this kernel intentionally does not use SIMD and is branched by password
 * length so that short passwords keep a cheaper path.
 */
__kernel void m00400_loop (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global phpass_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  /**
   * base
   */

  const u64 gid = get_global_id (0);

  // work-items at or beyond gid_max have nothing to process
  if (gid >= gid_max) return;

  /**
   * init
   */

  const u32 pw_len = pws[gid].pw_len;

  // private, zero-filled copy of the password (64 x u32 = 256 bytes)
  u32 w[64] = { 0 };

  // copy only as many 32-bit words as pw_len requires; i counts bytes,
  // idx counts words (unsigned counters to match the u32 pw_len)
  for (u32 i = 0, idx = 0; i < pw_len; i += 4, idx += 1)
  {
    w[idx] = pws[gid].i[idx];
  }

  u32 digest[4];

  digest[0] = tmps[gid].digest_buf[0];
  digest[1] = tmps[gid].digest_buf[1];
  digest[2] = tmps[gid].digest_buf[2];
  digest[3] = tmps[gid].digest_buf[3];

  /**
   * loop
   */

  md5_ctx_t md5_ctx;

  // first round: previous digest (16 bytes) placed directly into the
  // context's first words, then the password is appended
  md5_init (&md5_ctx);

  md5_ctx.w0[0] = digest[0];
  md5_ctx.w0[1] = digest[1];
  md5_ctx.w0[2] = digest[2];
  md5_ctx.w0[3] = digest[3];

  md5_ctx.len = 16;

  md5_update (&md5_ctx, w, pw_len);

  md5_final (&md5_ctx);

  digest[0] = md5_ctx.h[0];
  digest[1] = md5_ctx.h[1];
  digest[2] = md5_ctx.h[2];
  digest[3] = md5_ctx.h[3];

  // 16 digest bytes + password + 1 byte reaching 56 selects the generic
  // path; presumably 56 is the limit for the padded message to be handled
  // as a single 64-byte MD5 block -- TODO confirm against inc_hash_md5.cl
  if ((16 + pw_len + 1) >= 56)
  {
    // generic path: rebuild the whole context every round
    for (u32 i = 1; i < loop_cnt; i++)
    {
      md5_init (&md5_ctx);

      md5_ctx.w0[0] = digest[0];
      md5_ctx.w0[1] = digest[1];
      md5_ctx.w0[2] = digest[2];
      md5_ctx.w0[3] = digest[3];

      md5_ctx.len = 16;

      md5_update (&md5_ctx, w, pw_len);

      md5_final (&md5_ctx);

      digest[0] = md5_ctx.h[0];
      digest[1] = md5_ctx.h[1];
      digest[2] = md5_ctx.h[2];
      digest[3] = md5_ctx.h[3];
    }
  }
  else
  {
    // short-password path: md5_ctx.w0..w3 are reused as left behind by the
    // first round above; only the four leading words (the previous digest)
    // are replaced each round before a single md5_transform.
    // NOTE(review): this relies on md5_final leaving the padded block in
    // w0..w3 -- verify against inc_hash_md5.cl before changing the order.
    for (u32 i = 1; i < loop_cnt; i++)
    {
      md5_ctx.w0[0] = digest[0];
      md5_ctx.w0[1] = digest[1];
      md5_ctx.w0[2] = digest[2];
      md5_ctx.w0[3] = digest[3];

      // restart from the MD5M_* constants; md5_transform updates digest in place
      digest[0] = MD5M_A;
      digest[1] = MD5M_B;
      digest[2] = MD5M_C;
      digest[3] = MD5M_D;

      md5_transform (md5_ctx.w0, md5_ctx.w1, md5_ctx.w2, md5_ctx.w3, digest);
    }
  }

  // persist the digest for the next loop invocation / the comp kernel
  tmps[gid].digest_buf[0] = digest[0];
  tmps[gid].digest_buf[1] = digest[1];
  tmps[gid].digest_buf[2] = digest[2];
  tmps[gid].digest_buf[3] = digest[3];
}
|
|
|
|
|
2017-08-25 15:52:55 +00:00
|
|
|
/**
 * m00400_comp: compares the final digest of this work-item against the
 * target hashes.
 *
 * Loads the four digest words from tmps[gid].digest_buf (selected through
 * the DGST_R0..DGST_R3 indices, which are defined outside this file) into
 * r0..r3 and then textually includes COMPARE_M ("inc_comp_multi.cl"),
 * which performs the actual comparison.
 *
 * The local names gid, lid, r0..r3 and the il_pos macro are kept exactly
 * as they are because the included file is expanded in this scope and
 * presumably refers to them by name -- verify against inc_comp_multi.cl
 * before renaming or removing any of them.
 */
__kernel void m00400_comp (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global phpass_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u64 gid_max)
{
  /**
   * modifier
   */

  const u64 gid = get_global_id (0);
  const u64 lid = get_local_id (0);

  // work-items at or beyond gid_max have nothing to process
  if (gid >= gid_max) return;

  /**
   * digest
   */

  const u32 r0 = tmps[gid].digest_buf[DGST_R0];
  const u32 r1 = tmps[gid].digest_buf[DGST_R1];
  const u32 r2 = tmps[gid].digest_buf[DGST_R2];
  const u32 r3 = tmps[gid].digest_buf[DGST_R3];

  // fixed to 0 in this kernel; consumed by the included comparison code
  #define il_pos 0

  #include COMPARE_M
}
|