You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
hashcat/OpenCL/inc_hash_md5.cl

1646 lines
38 KiB

Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
// important notes on this:
// input buf unused bytes needs to be set to zero
// input buf needs to be in algorithm native byte order (md5 = LE, sha1 = BE, etc)
// input buf needs to be 64 byte aligned when using md5_update()
Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
typedef struct md5_ctx
{
u32 h[4];
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int len;
} md5_ctx_t;
void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{
u32 a = digest[0];
u32 b = digest[1];
u32 c = digest[2];
u32 d = digest[3];
u32 w0_t = w0[0];
u32 w1_t = w0[1];
u32 w2_t = w0[2];
u32 w3_t = w0[3];
u32 w4_t = w1[0];
u32 w5_t = w1[1];
u32 w6_t = w1[2];
u32 w7_t = w1[3];
u32 w8_t = w2[0];
u32 w9_t = w2[1];
u32 wa_t = w2[2];
u32 wb_t = w2[3];
u32 wc_t = w3[0];
u32 wd_t = w3[1];
u32 we_t = w3[2];
u32 wf_t = w3[3];
MD5_STEP_S (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP_S (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
MD5_STEP_S (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
MD5_STEP_S (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
MD5_STEP_S (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
MD5_STEP_S (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
MD5_STEP_S (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
MD5_STEP_S (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
MD5_STEP_S (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
MD5_STEP_S (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
MD5_STEP_S (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
MD5_STEP_S (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
MD5_STEP_S (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
MD5_STEP_S (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
MD5_STEP_S (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
MD5_STEP_S (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
MD5_STEP_S (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
MD5_STEP_S (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
MD5_STEP_S (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
MD5_STEP_S (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
MD5_STEP_S (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
MD5_STEP_S (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
MD5_STEP_S (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
MD5_STEP_S (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
MD5_STEP_S (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
MD5_STEP_S (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
MD5_STEP_S (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
MD5_STEP_S (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
MD5_STEP_S (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
MD5_STEP_S (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
MD5_STEP_S (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
MD5_STEP_S (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
MD5_STEP_S (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
MD5_STEP_S (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
MD5_STEP_S (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
MD5_STEP_S (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
MD5_STEP_S (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
MD5_STEP_S (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
MD5_STEP_S (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
MD5_STEP_S (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
MD5_STEP_S (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
MD5_STEP_S (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
MD5_STEP_S (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
MD5_STEP_S (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
MD5_STEP_S (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
MD5_STEP_S (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
MD5_STEP_S (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
MD5_STEP_S (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
MD5_STEP_S (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
MD5_STEP_S (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
MD5_STEP_S (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
MD5_STEP_S (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
MD5_STEP_S (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
MD5_STEP_S (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
MD5_STEP_S (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
MD5_STEP_S (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
MD5_STEP_S (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
MD5_STEP_S (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
MD5_STEP_S (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
MD5_STEP_S (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
MD5_STEP_S (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
MD5_STEP_S (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP_S (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP_S (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
digest[0] += a;
digest[1] += b;
digest[2] += c;
digest[3] += d;
}
void md5_init (md5_ctx_t *ctx)
{
ctx->h[0] = MD5M_A;
ctx->h[1] = MD5M_B;
ctx->h[2] = MD5M_C;
ctx->h[3] = MD5M_D;
ctx->w0[0] = 0;
ctx->w0[1] = 0;
ctx->w0[2] = 0;
ctx->w0[3] = 0;
ctx->w1[0] = 0;
ctx->w1[1] = 0;
ctx->w1[2] = 0;
ctx->w1[3] = 0;
ctx->w2[0] = 0;
ctx->w2[1] = 0;
ctx->w2[2] = 0;
ctx->w2[3] = 0;
ctx->w3[0] = 0;
ctx->w3[1] = 0;
ctx->w3[2] = 0;
ctx->w3[3] = 0;
ctx->len = 0;
}
void md5_update_64 (md5_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len)
{
const int pos = ctx->len & 63;
ctx->len += len;
if ((pos + len) < 64)
{
switch_buffer_by_offset_le_S (w0, w1, w2, w3, pos);
ctx->w0[0] |= w0[0];
ctx->w0[1] |= w0[1];
ctx->w0[2] |= w0[2];
ctx->w0[3] |= w0[3];
ctx->w1[0] |= w1[0];
ctx->w1[1] |= w1[1];
ctx->w1[2] |= w1[2];
ctx->w1[3] |= w1[3];
ctx->w2[0] |= w2[0];
ctx->w2[1] |= w2[1];
ctx->w2[2] |= w2[2];
ctx->w2[3] |= w2[3];
ctx->w3[0] |= w3[0];
ctx->w3[1] |= w3[1];
ctx->w3[2] |= w3[2];
ctx->w3[3] |= w3[3];
}
else
{
u32 c0[4] = { 0 };
u32 c1[4] = { 0 };
u32 c2[4] = { 0 };
u32 c3[4] = { 0 };
switch_buffer_by_offset_carry_le_S (w0, w1, w2, w3, c0, c1, c2, c3, pos);
ctx->w0[0] |= w0[0];
ctx->w0[1] |= w0[1];
ctx->w0[2] |= w0[2];
ctx->w0[3] |= w0[3];
ctx->w1[0] |= w1[0];
ctx->w1[1] |= w1[1];
ctx->w1[2] |= w1[2];
ctx->w1[3] |= w1[3];
ctx->w2[0] |= w2[0];
ctx->w2[1] |= w2[1];
ctx->w2[2] |= w2[2];
ctx->w2[3] |= w2[3];
ctx->w3[0] |= w3[0];
ctx->w3[1] |= w3[1];
ctx->w3[2] |= w3[2];
ctx->w3[3] |= w3[3];
md5_transform (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);
ctx->w0[0] = c0[0];
ctx->w0[1] = c0[1];
ctx->w0[2] = c0[2];
ctx->w0[3] = c0[3];
ctx->w1[0] = c1[0];
ctx->w1[1] = c1[1];
ctx->w1[2] = c1[2];
ctx->w1[3] = c1[3];
ctx->w2[0] = c2[0];
ctx->w2[1] = c2[1];
ctx->w2[2] = c2[2];
ctx->w2[3] = c2[3];
ctx->w3[0] = c3[0];
ctx->w3[1] = c3[1];
ctx->w3[2] = c3[2];
ctx->w3[3] = c3[3];
}
}
void md5_update (md5_ctx_t *ctx, const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
md5_update_64 (ctx, w0, w1, w2, w3, 64);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
md5_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}
void md5_update_swap (md5_ctx_t *ctx, const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, 64);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}
void md5_update_utf16le (md5_ctx_t *ctx, const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
md5_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
md5_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}
void md5_update_utf16le_swap (md5_ctx_t *ctx, const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}
Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
void md5_update_global (md5_ctx_t *ctx, const __global u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
md5_update_64 (ctx, w0, w1, w2, w3, 64);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
md5_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}
void md5_update_global_swap (md5_ctx_t *ctx, const __global u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, 64);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, len - pos1);
}
Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
void md5_update_global_utf16le (md5_ctx_t *ctx, const __global u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
md5_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
md5_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}
void md5_update_global_utf16le_swap (md5_ctx_t *ctx, const __global u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, 32 * 2);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le_S (w1, w2, w3);
make_utf16le_S (w0, w0, w1);
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
w0[3] = swap32_S (w0[3]);
w1[0] = swap32_S (w1[0]);
w1[1] = swap32_S (w1[1]);
w1[2] = swap32_S (w1[2]);
w1[3] = swap32_S (w1[3]);
w2[0] = swap32_S (w2[0]);
w2[1] = swap32_S (w2[1]);
w2[2] = swap32_S (w2[2]);
w2[3] = swap32_S (w2[3]);
w3[0] = swap32_S (w3[0]);
w3[1] = swap32_S (w3[1]);
w3[2] = swap32_S (w3[2]);
w3[3] = swap32_S (w3[3]);
md5_update_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}
Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
void md5_final (md5_ctx_t *ctx)
{
const int pos = ctx->len & 63;
append_0x80_4x4_S (ctx->w0, ctx->w1, ctx->w2, ctx->w3, pos);
if (pos >= 56)
{
md5_transform (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);
ctx->w0[0] = 0;
ctx->w0[1] = 0;
ctx->w0[2] = 0;
ctx->w0[3] = 0;
ctx->w1[0] = 0;
ctx->w1[1] = 0;
ctx->w1[2] = 0;
ctx->w1[3] = 0;
ctx->w2[0] = 0;
ctx->w2[1] = 0;
ctx->w2[2] = 0;
ctx->w2[3] = 0;
ctx->w3[0] = 0;
ctx->w3[1] = 0;
ctx->w3[2] = 0;
ctx->w3[3] = 0;
}
ctx->w3[2] = ctx->len * 8;
ctx->w3[3] = 0;
md5_transform (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);
}
// md5_hmac
typedef struct md5_hmac_ctx
{
md5_ctx_t ipad;
md5_ctx_t opad;
} md5_hmac_ctx_t;
void md5_hmac_init_64 (md5_hmac_ctx_t *ctx, const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4])
{
u32 t0[4];
u32 t1[4];
u32 t2[4];
u32 t3[4];
// ipad
t0[0] = w0[0] ^ 0x36363636;
t0[1] = w0[1] ^ 0x36363636;
t0[2] = w0[2] ^ 0x36363636;
t0[3] = w0[3] ^ 0x36363636;
t1[0] = w1[0] ^ 0x36363636;
t1[1] = w1[1] ^ 0x36363636;
t1[2] = w1[2] ^ 0x36363636;
t1[3] = w1[3] ^ 0x36363636;
t2[0] = w2[0] ^ 0x36363636;
t2[1] = w2[1] ^ 0x36363636;
t2[2] = w2[2] ^ 0x36363636;
t2[3] = w2[3] ^ 0x36363636;
t3[0] = w3[0] ^ 0x36363636;
t3[1] = w3[1] ^ 0x36363636;
t3[2] = w3[2] ^ 0x36363636;
t3[3] = w3[3] ^ 0x36363636;
md5_init (&ctx->ipad);
md5_update_64 (&ctx->ipad, t0, t1, t2, t3, 64);
// opad
t0[0] = w0[0] ^ 0x5c5c5c5c;
t0[1] = w0[1] ^ 0x5c5c5c5c;
t0[2] = w0[2] ^ 0x5c5c5c5c;
t0[3] = w0[3] ^ 0x5c5c5c5c;
t1[0] = w1[0] ^ 0x5c5c5c5c;
t1[1] = w1[1] ^ 0x5c5c5c5c;
t1[2] = w1[2] ^ 0x5c5c5c5c;
t1[3] = w1[3] ^ 0x5c5c5c5c;
t2[0] = w2[0] ^ 0x5c5c5c5c;
t2[1] = w2[1] ^ 0x5c5c5c5c;
t2[2] = w2[2] ^ 0x5c5c5c5c;
t2[3] = w2[3] ^ 0x5c5c5c5c;
t3[0] = w3[0] ^ 0x5c5c5c5c;
t3[1] = w3[1] ^ 0x5c5c5c5c;
t3[2] = w3[2] ^ 0x5c5c5c5c;
t3[3] = w3[3] ^ 0x5c5c5c5c;
md5_init (&ctx->opad);
md5_update_64 (&ctx->opad, t0, t1, t2, t3, 64);
}
void md5_hmac_init (md5_hmac_ctx_t *ctx, const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
if (len > 64)
{
md5_ctx_t tmp;
md5_init (&tmp);
md5_update (&tmp, w, len);
md5_final (&tmp);
w0[0] = tmp.h[0];
w0[1] = tmp.h[1];
w0[2] = tmp.h[2];
w0[3] = tmp.h[3];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
}
else
{
w0[0] = w[ 0];
w0[1] = w[ 1];
w0[2] = w[ 2];
w0[3] = w[ 3];
w1[0] = w[ 4];
w1[1] = w[ 5];
w1[2] = w[ 6];
w1[3] = w[ 7];
w2[0] = w[ 8];
w2[1] = w[ 9];
w2[2] = w[10];
w2[3] = w[11];
w3[0] = w[12];
w3[1] = w[13];
w3[2] = w[14];
w3[3] = w[15];
}
md5_hmac_init_64 (ctx, w0, w1, w2, w3);
}
void md5_hmac_init_swap (md5_hmac_ctx_t *ctx, const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
if (len > 64)
{
md5_ctx_t tmp;
md5_init (&tmp);
md5_update_swap (&tmp, w, len);
md5_final (&tmp);
w0[0] = tmp.h[0];
w0[1] = tmp.h[1];
w0[2] = tmp.h[2];
w0[3] = tmp.h[3];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
}
else
{
w0[0] = swap32_S (w[ 0]);
w0[1] = swap32_S (w[ 1]);
w0[2] = swap32_S (w[ 2]);
w0[3] = swap32_S (w[ 3]);
w1[0] = swap32_S (w[ 4]);
w1[1] = swap32_S (w[ 5]);
w1[2] = swap32_S (w[ 6]);
w1[3] = swap32_S (w[ 7]);
w2[0] = swap32_S (w[ 8]);
w2[1] = swap32_S (w[ 9]);
w2[2] = swap32_S (w[10]);
w2[3] = swap32_S (w[11]);
w3[0] = swap32_S (w[12]);
w3[1] = swap32_S (w[13]);
w3[2] = swap32_S (w[14]);
w3[3] = swap32_S (w[15]);
}
md5_hmac_init_64 (ctx, w0, w1, w2, w3);
}
void md5_hmac_init_global (md5_hmac_ctx_t *ctx, __global const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
if (len > 64)
{
md5_ctx_t tmp;
md5_init (&tmp);
md5_update_global (&tmp, w, len);
md5_final (&tmp);
w0[0] = tmp.h[0];
w0[1] = tmp.h[1];
w0[2] = tmp.h[2];
w0[3] = tmp.h[3];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
}
else
{
w0[0] = w[ 0];
w0[1] = w[ 1];
w0[2] = w[ 2];
w0[3] = w[ 3];
w1[0] = w[ 4];
w1[1] = w[ 5];
w1[2] = w[ 6];
w1[3] = w[ 7];
w2[0] = w[ 8];
w2[1] = w[ 9];
w2[2] = w[10];
w2[3] = w[11];
w3[0] = w[12];
w3[1] = w[13];
w3[2] = w[14];
w3[3] = w[15];
}
md5_hmac_init_64 (ctx, w0, w1, w2, w3);
}
void md5_hmac_init_global_swap (md5_hmac_ctx_t *ctx, __global const u32 *w, const int len)
{
u32 w0[4];
u32 w1[4];
u32 w2[4];
u32 w3[4];
if (len > 64)
{
md5_ctx_t tmp;
md5_init (&tmp);
md5_update_global_swap (&tmp, w, len);
md5_final (&tmp);
w0[0] = tmp.h[0];
w0[1] = tmp.h[1];
w0[2] = tmp.h[2];
w0[3] = tmp.h[3];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
}
else
{
w0[0] = swap32_S (w[ 0]);
w0[1] = swap32_S (w[ 1]);
w0[2] = swap32_S (w[ 2]);
w0[3] = swap32_S (w[ 3]);
w1[0] = swap32_S (w[ 4]);
w1[1] = swap32_S (w[ 5]);
w1[2] = swap32_S (w[ 6]);
w1[3] = swap32_S (w[ 7]);
w2[0] = swap32_S (w[ 8]);
w2[1] = swap32_S (w[ 9]);
w2[2] = swap32_S (w[10]);
w2[3] = swap32_S (w[11]);
w3[0] = swap32_S (w[12]);
w3[1] = swap32_S (w[13]);
w3[2] = swap32_S (w[14]);
w3[3] = swap32_S (w[15]);
}
md5_hmac_init_64 (ctx, w0, w1, w2, w3);
}
void md5_hmac_update_64 (md5_hmac_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len)
{
md5_update_64 (&ctx->ipad, w0, w1, w2, w3, len);
}
void md5_hmac_update (md5_hmac_ctx_t *ctx, const u32 *w, const int len)
{
md5_update (&ctx->ipad, w, len);
}
void md5_hmac_update_global (md5_hmac_ctx_t *ctx, const __global u32 *w, const int len)
{
md5_update_global (&ctx->ipad, w, len);
}
void md5_hmac_update_global_swap (md5_hmac_ctx_t *ctx, const __global u32 *w, const int len)
{
md5_update_global_swap (&ctx->ipad, w, len);
}
void md5_hmac_update_global_utf16le (md5_hmac_ctx_t *ctx, const __global u32 *w, const int len)
{
md5_update_global_utf16le (&ctx->ipad, w, len);
}
void md5_hmac_update_global_utf16le_swap (md5_hmac_ctx_t *ctx, const __global u32 *w, const int len)
{
md5_update_global_utf16le_swap (&ctx->ipad, w, len);
}
void md5_hmac_final (md5_hmac_ctx_t *ctx)
{
md5_final (&ctx->ipad);
u32 t0[4];
u32 t1[4];
u32 t2[4];
u32 t3[4];
t0[0] = ctx->ipad.h[0];
t0[1] = ctx->ipad.h[1];
t0[2] = ctx->ipad.h[2];
t0[3] = ctx->ipad.h[3];
t1[0] = 0;
t1[1] = 0;
t1[2] = 0;
t1[3] = 0;
t2[0] = 0;
t2[1] = 0;
t2[2] = 0;
t2[3] = 0;
t3[0] = 0;
t3[1] = 0;
t3[2] = 0;
t3[3] = 0;
md5_update_64 (&ctx->opad, t0, t1, t2, t3, 16);
md5_final (&ctx->opad);
}
Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
// while input buf can be a vector datatype, the length of the different elements can not
typedef struct md5_ctx_vector
{
u32x h[4];
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
int len;
} md5_ctx_vector_t;
void md5_transform_vector (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4])
{
u32x a = digest[0];
u32x b = digest[1];
u32x c = digest[2];
u32x d = digest[3];
u32x w0_t = w0[0];
u32x w1_t = w0[1];
u32x w2_t = w0[2];
u32x w3_t = w0[3];
u32x w4_t = w1[0];
u32x w5_t = w1[1];
u32x w6_t = w1[2];
u32x w7_t = w1[3];
u32x w8_t = w2[0];
u32x w9_t = w2[1];
u32x wa_t = w2[2];
u32x wb_t = w2[3];
u32x wc_t = w3[0];
u32x wd_t = w3[1];
u32x we_t = w3[2];
u32x wf_t = w3[3];
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
digest[0] += a;
digest[1] += b;
digest[2] += c;
digest[3] += d;
}
void md5_init_vector (md5_ctx_vector_t *ctx)
{
ctx->h[0] = MD5M_A;
ctx->h[1] = MD5M_B;
ctx->h[2] = MD5M_C;
ctx->h[3] = MD5M_D;
ctx->w0[0] = 0;
ctx->w0[1] = 0;
ctx->w0[2] = 0;
ctx->w0[3] = 0;
ctx->w1[0] = 0;
ctx->w1[1] = 0;
ctx->w1[2] = 0;
ctx->w1[3] = 0;
ctx->w2[0] = 0;
ctx->w2[1] = 0;
ctx->w2[2] = 0;
ctx->w2[3] = 0;
ctx->w3[0] = 0;
ctx->w3[1] = 0;
ctx->w3[2] = 0;
ctx->w3[3] = 0;
ctx->len = 0;
}
void md5_init_vector_from_scalar (md5_ctx_vector_t *ctx, md5_ctx_t *ctx0)
{
ctx->h[0] = ctx0->h[0];
ctx->h[1] = ctx0->h[1];
ctx->h[2] = ctx0->h[2];
ctx->h[3] = ctx0->h[3];
ctx->w0[0] = ctx0->w0[0];
ctx->w0[1] = ctx0->w0[1];
ctx->w0[2] = ctx0->w0[2];
ctx->w0[3] = ctx0->w0[3];
ctx->w1[0] = ctx0->w1[0];
ctx->w1[1] = ctx0->w1[1];
ctx->w1[2] = ctx0->w1[2];
ctx->w1[3] = ctx0->w1[3];
ctx->w2[0] = ctx0->w2[0];
ctx->w2[1] = ctx0->w2[1];
ctx->w2[2] = ctx0->w2[2];
ctx->w2[3] = ctx0->w2[3];
ctx->w3[0] = ctx0->w3[0];
ctx->w3[1] = ctx0->w3[1];
ctx->w3[2] = ctx0->w3[2];
ctx->w3[3] = ctx0->w3[3];
ctx->len = ctx0->len;
}
Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
void md5_update_vector_64 (md5_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len)
{
const int pos = ctx->len & 63;
ctx->len += len;
if ((pos + len) < 64)
{
switch_buffer_by_offset_le (w0, w1, w2, w3, pos);
ctx->w0[0] |= w0[0];
ctx->w0[1] |= w0[1];
ctx->w0[2] |= w0[2];
ctx->w0[3] |= w0[3];
ctx->w1[0] |= w1[0];
ctx->w1[1] |= w1[1];
ctx->w1[2] |= w1[2];
ctx->w1[3] |= w1[3];
ctx->w2[0] |= w2[0];
ctx->w2[1] |= w2[1];
ctx->w2[2] |= w2[2];
ctx->w2[3] |= w2[3];
ctx->w3[0] |= w3[0];
ctx->w3[1] |= w3[1];
ctx->w3[2] |= w3[2];
ctx->w3[3] |= w3[3];
}
else
{
u32x c0[4] = { 0 };
u32x c1[4] = { 0 };
u32x c2[4] = { 0 };
u32x c3[4] = { 0 };
switch_buffer_by_offset_carry_le (w0, w1, w2, w3, c0, c1, c2, c3, pos);
ctx->w0[0] |= w0[0];
ctx->w0[1] |= w0[1];
ctx->w0[2] |= w0[2];
ctx->w0[3] |= w0[3];
ctx->w1[0] |= w1[0];
ctx->w1[1] |= w1[1];
ctx->w1[2] |= w1[2];
ctx->w1[3] |= w1[3];
ctx->w2[0] |= w2[0];
ctx->w2[1] |= w2[1];
ctx->w2[2] |= w2[2];
ctx->w2[3] |= w2[3];
ctx->w3[0] |= w3[0];
ctx->w3[1] |= w3[1];
ctx->w3[2] |= w3[2];
ctx->w3[3] |= w3[3];
md5_transform_vector (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);
ctx->w0[0] = c0[0];
ctx->w0[1] = c0[1];
ctx->w0[2] = c0[2];
ctx->w0[3] = c0[3];
ctx->w1[0] = c1[0];
ctx->w1[1] = c1[1];
ctx->w1[2] = c1[2];
ctx->w1[3] = c1[3];
ctx->w2[0] = c2[0];
ctx->w2[1] = c2[1];
ctx->w2[2] = c2[2];
ctx->w2[3] = c2[3];
ctx->w3[0] = c3[0];
ctx->w3[1] = c3[1];
ctx->w3[2] = c3[2];
ctx->w3[3] = c3[3];
}
}
void md5_update_vector (md5_ctx_vector_t *ctx, const u32x *w, const int len)
{
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
md5_update_vector_64 (ctx, w0, w1, w2, w3, 64);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
w2[0] = w[pos4 + 8];
w2[1] = w[pos4 + 9];
w2[2] = w[pos4 + 10];
w2[3] = w[pos4 + 11];
w3[0] = w[pos4 + 12];
w3[1] = w[pos4 + 13];
w3[2] = w[pos4 + 14];
w3[3] = w[pos4 + 15];
md5_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1);
}
void md5_update_vector_utf16le (md5_ctx_vector_t *ctx, const u32x *w, const int len)
{
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
int pos1;
int pos4;
for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8)
{
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le (w1, w2, w3);
make_utf16le (w0, w0, w1);
md5_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2);
}
w0[0] = w[pos4 + 0];
w0[1] = w[pos4 + 1];
w0[2] = w[pos4 + 2];
w0[3] = w[pos4 + 3];
w1[0] = w[pos4 + 4];
w1[1] = w[pos4 + 5];
w1[2] = w[pos4 + 6];
w1[3] = w[pos4 + 7];
make_utf16le (w1, w2, w3);
make_utf16le (w0, w0, w1);
md5_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
}
Converted -m 400 to password length 256 support Something weird happend here, read on! I've expected some performance drop because this algorithm is using the password data itself inside the iteration loop. That is different to PBKDF2, which I've converted in mode 2100 before and which did not show any performance as expected. So after I've finished converting this kernel and testing everything works using the unit test, I did some benchmarks to see how much the performance drop is. On my 750ti, the speed dropped (minimal) from 981kH/s -> 948kH/s, that's mostly because of the SIMD support i had to drop. If I'd turn off the SIMD support in the original, the drop would be even less, that us 967kH/s -> 948kH/s which is a bit of a more reasable comparison in case we just want to rate the drop that is actually caused by the code change itself. The drop was acceptable for me, so I've decided to check on my GTX1080.Now the weird thing: The performance increased from 6619kH/s to 7134kH/s!! When I gave it a second thought, it turned out that: 1. The GTX1080 is a scalar GPU so it wont suffer from the drop of the SIMD code as the 750ti did 2. There's a change in how the global data (password) is read into the registers, it reads only that amount of data it actually needs by using the pw_len information 3. I've added a barrier for CLK_GLOBAL_MEM_FENCE as it turned out to increase the performance in the 750ti Note that this kernel is now branched into password length < 40 and larger. There's a large drop on performance where SIMD is really important, for example CPU. We could workaround this issue by sticking to SIMD inside the length < 40 branch, but I don't know yet how this can be done efficiently.
7 years ago
void md5_final_vector (md5_ctx_vector_t *ctx)
{
const int pos = ctx->len & 63;
append_0x80_4x4 (ctx->w0, ctx->w1, ctx->w2, ctx->w3, pos);
if (pos >= 56)
{
md5_transform_vector (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);
ctx->w0[0] = 0;
ctx->w0[1] = 0;
ctx->w0[2] = 0;
ctx->w0[3] = 0;
ctx->w1[0] = 0;
ctx->w1[1] = 0;
ctx->w1[2] = 0;
ctx->w1[3] = 0;
ctx->w2[0] = 0;
ctx->w2[1] = 0;
ctx->w2[2] = 0;
ctx->w2[3] = 0;
ctx->w3[0] = 0;
ctx->w3[1] = 0;
ctx->w3[2] = 0;
ctx->w3[3] = 0;
}
ctx->w3[2] = ctx->len * 8;
ctx->w3[3] = 0;
md5_transform_vector (ctx->w0, ctx->w1, ctx->w2, ctx->w3, ctx->h);
}
// HMAC + Vector
typedef struct md5_hmac_ctx_vector
{
md5_ctx_vector_t ipad;
md5_ctx_vector_t opad;
} md5_hmac_ctx_vector_t;
void md5_hmac_init_vector_64 (md5_hmac_ctx_vector_t *ctx, const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4])
{
u32x t0[4];
u32x t1[4];
u32x t2[4];
u32x t3[4];
// ipad
t0[0] = w0[0] ^ 0x36363636;
t0[1] = w0[1] ^ 0x36363636;
t0[2] = w0[2] ^ 0x36363636;
t0[3] = w0[3] ^ 0x36363636;
t1[0] = w1[0] ^ 0x36363636;
t1[1] = w1[1] ^ 0x36363636;
t1[2] = w1[2] ^ 0x36363636;
t1[3] = w1[3] ^ 0x36363636;
t2[0] = w2[0] ^ 0x36363636;
t2[1] = w2[1] ^ 0x36363636;
t2[2] = w2[2] ^ 0x36363636;
t2[3] = w2[3] ^ 0x36363636;
t3[0] = w3[0] ^ 0x36363636;
t3[1] = w3[1] ^ 0x36363636;
t3[2] = w3[2] ^ 0x36363636;
t3[3] = w3[3] ^ 0x36363636;
md5_init_vector (&ctx->ipad);
md5_update_vector_64 (&ctx->ipad, t0, t1, t2, t3, 64);
// opad
t0[0] = w0[0] ^ 0x5c5c5c5c;
t0[1] = w0[1] ^ 0x5c5c5c5c;
t0[2] = w0[2] ^ 0x5c5c5c5c;
t0[3] = w0[3] ^ 0x5c5c5c5c;
t1[0] = w1[0] ^ 0x5c5c5c5c;
t1[1] = w1[1] ^ 0x5c5c5c5c;
t1[2] = w1[2] ^ 0x5c5c5c5c;
t1[3] = w1[3] ^ 0x5c5c5c5c;
t2[0] = w2[0] ^ 0x5c5c5c5c;
t2[1] = w2[1] ^ 0x5c5c5c5c;
t2[2] = w2[2] ^ 0x5c5c5c5c;
t2[3] = w2[3] ^ 0x5c5c5c5c;
t3[0] = w3[0] ^ 0x5c5c5c5c;
t3[1] = w3[1] ^ 0x5c5c5c5c;
t3[2] = w3[2] ^ 0x5c5c5c5c;
t3[3] = w3[3] ^ 0x5c5c5c5c;
md5_init_vector (&ctx->opad);
md5_update_vector_64 (&ctx->opad, t0, t1, t2, t3, 64);
}
void md5_hmac_init_vector (md5_hmac_ctx_vector_t *ctx, const u32x *w, const int len)
{
u32x w0[4];
u32x w1[4];
u32x w2[4];
u32x w3[4];
if (len > 64)
{
md5_ctx_vector_t tmp;
md5_init_vector (&tmp);
md5_update_vector (&tmp, w, len);
md5_final_vector (&tmp);
w0[0] = tmp.h[0];
w0[1] = tmp.h[1];
w0[2] = tmp.h[2];
w0[3] = tmp.h[3];
w1[0] = 0;
w1[1] = 0;
w1[2] = 0;
w1[3] = 0;
w2[0] = 0;
w2[1] = 0;
w2[2] = 0;
w2[3] = 0;
w3[0] = 0;
w3[1] = 0;
w3[2] = 0;
w3[3] = 0;
}
else
{
w0[0] = w[ 0];
w0[1] = w[ 1];
w0[2] = w[ 2];
w0[3] = w[ 3];
w1[0] = w[ 4];
w1[1] = w[ 5];
w1[2] = w[ 6];
w1[3] = w[ 7];
w2[0] = w[ 8];
w2[1] = w[ 9];
w2[2] = w[10];
w2[3] = w[11];
w3[0] = w[12];
w3[1] = w[13];
w3[2] = w[14];
w3[3] = w[15];
}
md5_hmac_init_vector_64 (ctx, w0, w1, w2, w3);
}
void md5_hmac_update_vector_64 (md5_hmac_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len)
{
md5_update_vector_64 (&ctx->ipad, w0, w1, w2, w3, len);
}
void md5_hmac_update_vector (md5_hmac_ctx_vector_t *ctx, const u32x *w, const int len)
{
md5_update_vector (&ctx->ipad, w, len);
}
void md5_hmac_final_vector (md5_hmac_ctx_vector_t *ctx)
{
md5_final_vector (&ctx->ipad);
u32x t0[4];
u32x t1[4];
u32x t2[4];
u32x t3[4];
t0[0] = ctx->ipad.h[0];
t0[1] = ctx->ipad.h[1];
t0[2] = ctx->ipad.h[2];
t0[3] = ctx->ipad.h[3];
t1[0] = 0;
t1[1] = 0;
t1[2] = 0;
t1[3] = 0;
t2[0] = 0;
t2[1] = 0;
t2[2] = 0;
t2[3] = 0;
t3[0] = 0;
t3[1] = 0;
t3[2] = 0;
t3[3] = 0;
md5_update_vector_64 (&ctx->opad, t0, t1, t2, t3, 16);
md5_final_vector (&ctx->opad);
}