/*
 * hashcat/OpenCL/inc_hash_blake2b.cl
 * Mirror of https://github.com/hashcat/hashcat.git (synced 2024-11-17 21:59:06 +00:00).
 * Repository viewer listing: 868 lines, 23 KiB.
 */

/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.h"
#include "inc_common.h"
#include "inc_hash_blake2b.h"
/**
 * Rotate a 64-bit scalar right by 16 bits.
 *
 * On NVIDIA, and on AMD/HIP builds where HAS_VPERM is set, the result is
 * assembled from two 32-bit byte permutes over the halves of the input.
 * All remaining platforms go through the generic hc_rotr64_S() helper.
 *
 * a: value to rotate
 * returns: the rotated value
 */
DECLSPEC u64 blake2b_rot16_S (const u64 a)
{
  #if defined IS_NV

  vconv64_t src;
  vconv64_t dst;

  src.v64 = a;

  // this path passes 4-nibble selectors to hc_byte_perm_S()
  dst.v32.a = hc_byte_perm_S (src.v32.b, src.v32.a, 0x1076);
  dst.v32.b = hc_byte_perm_S (src.v32.b, src.v32.a, 0x5432);

  return dst.v64;

  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM

  vconv64_t src;
  vconv64_t dst;

  src.v64 = a;

  // this path passes one selector byte per output byte
  dst.v32.a = hc_byte_perm_S (src.v32.b, src.v32.a, 0x01000706);
  dst.v32.b = hc_byte_perm_S (src.v32.b, src.v32.a, 0x05040302);

  return dst.v64;

  #else

  // portable fallback
  return hc_rotr64_S (a, 16);

  #endif
}
/**
 * Vector form of blake2b_rot16_S(): applies the 16-bit right rotation to
 * every lane of a.
 *
 * Lanes are handled one component at a time; VECT_SIZE decides at compile
 * time which components exist.
 */
DECLSPEC u64x blake2b_rot16 (const u64x a)
{
  u64x rotated;

  // single-lane build: the value is handed to the scalar helper as a whole
  #if VECT_SIZE == 1
  rotated = blake2b_rot16_S (a);
  #endif

  #if VECT_SIZE >= 2
  rotated.s0 = blake2b_rot16_S (a.s0);
  rotated.s1 = blake2b_rot16_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  rotated.s2 = blake2b_rot16_S (a.s2);
  rotated.s3 = blake2b_rot16_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  rotated.s4 = blake2b_rot16_S (a.s4);
  rotated.s5 = blake2b_rot16_S (a.s5);
  rotated.s6 = blake2b_rot16_S (a.s6);
  rotated.s7 = blake2b_rot16_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  rotated.s8 = blake2b_rot16_S (a.s8);
  rotated.s9 = blake2b_rot16_S (a.s9);
  rotated.sa = blake2b_rot16_S (a.sa);
  rotated.sb = blake2b_rot16_S (a.sb);
  rotated.sc = blake2b_rot16_S (a.sc);
  rotated.sd = blake2b_rot16_S (a.sd);
  rotated.se = blake2b_rot16_S (a.se);
  rotated.sf = blake2b_rot16_S (a.sf);
  #endif

  return rotated;
}
/**
 * Rotate a 64-bit scalar right by 24 bits.
 *
 * On NVIDIA, and on AMD/HIP builds where HAS_VPERM is set, the result is
 * assembled from two 32-bit byte permutes over the halves of the input.
 * All remaining platforms go through the generic hc_rotr64_S() helper.
 *
 * a: value to rotate
 * returns: the rotated value
 */
DECLSPEC u64 blake2b_rot24_S (const u64 a)
{
  #if defined IS_NV

  vconv64_t in;

  in.v64 = a;

  vconv64_t out;

  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x2107);
  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x6543);

  return out.v64;

  #elif (defined IS_AMD || defined IS_HIP) && HAS_VPERM

  vconv64_t in;

  in.v64 = a;

  vconv64_t out;

  out.v32.a = hc_byte_perm_S (in.v32.b, in.v32.a, 0x02010007);
  out.v32.b = hc_byte_perm_S (in.v32.b, in.v32.a, 0x06050403);

  return out.v64;

  #else

  // Must be 24, not 16: this is the rot24 helper, and rotating by 16 here
  // made the generic path disagree with the byte-permute paths above.
  return hc_rotr64_S (a, 24);

  #endif
}
/**
 * Vector form of blake2b_rot24_S(): applies the 24-bit right rotation to
 * every lane of a.
 *
 * Lanes are handled one component at a time; VECT_SIZE decides at compile
 * time which components exist.
 */
DECLSPEC u64x blake2b_rot24 (const u64x a)
{
  u64x rotated;

  // single-lane build: the value is handed to the scalar helper as a whole
  #if VECT_SIZE == 1
  rotated = blake2b_rot24_S (a);
  #endif

  #if VECT_SIZE >= 2
  rotated.s0 = blake2b_rot24_S (a.s0);
  rotated.s1 = blake2b_rot24_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  rotated.s2 = blake2b_rot24_S (a.s2);
  rotated.s3 = blake2b_rot24_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  rotated.s4 = blake2b_rot24_S (a.s4);
  rotated.s5 = blake2b_rot24_S (a.s5);
  rotated.s6 = blake2b_rot24_S (a.s6);
  rotated.s7 = blake2b_rot24_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  rotated.s8 = blake2b_rot24_S (a.s8);
  rotated.s9 = blake2b_rot24_S (a.s9);
  rotated.sa = blake2b_rot24_S (a.sa);
  rotated.sb = blake2b_rot24_S (a.sb);
  rotated.sc = blake2b_rot24_S (a.sc);
  rotated.sd = blake2b_rot24_S (a.sd);
  rotated.se = blake2b_rot24_S (a.se);
  rotated.sf = blake2b_rot24_S (a.sf);
  #endif

  return rotated;
}
/**
 * Rotate a 64-bit scalar by 32 bits.
 *
 * No shifting is needed: the two 32-bit halves of the value are exchanged
 * through the vconv64_t view.
 *
 * a: value to rotate
 * returns: a with its 32-bit halves swapped
 */
DECLSPEC u64 blake2b_rot32_S (const u64 a)
{
  vconv64_t src;
  vconv64_t swapped;

  src.v64 = a;

  // each half of the result takes the opposite half of the input
  swapped.v32.b = src.v32.a;
  swapped.v32.a = src.v32.b;

  return swapped.v64;
}
/**
 * Vector form of blake2b_rot32_S(): swaps the 32-bit halves of every lane
 * of a.
 *
 * Lanes are handled one component at a time; VECT_SIZE decides at compile
 * time which components exist.
 */
DECLSPEC u64x blake2b_rot32 (const u64x a)
{
  u64x rotated;

  // single-lane build: the value is handed to the scalar helper as a whole
  #if VECT_SIZE == 1
  rotated = blake2b_rot32_S (a);
  #endif

  #if VECT_SIZE >= 2
  rotated.s0 = blake2b_rot32_S (a.s0);
  rotated.s1 = blake2b_rot32_S (a.s1);
  #endif

  #if VECT_SIZE >= 4
  rotated.s2 = blake2b_rot32_S (a.s2);
  rotated.s3 = blake2b_rot32_S (a.s3);
  #endif

  #if VECT_SIZE >= 8
  rotated.s4 = blake2b_rot32_S (a.s4);
  rotated.s5 = blake2b_rot32_S (a.s5);
  rotated.s6 = blake2b_rot32_S (a.s6);
  rotated.s7 = blake2b_rot32_S (a.s7);
  #endif

  #if VECT_SIZE >= 16
  rotated.s8 = blake2b_rot32_S (a.s8);
  rotated.s9 = blake2b_rot32_S (a.s9);
  rotated.sa = blake2b_rot32_S (a.sa);
  rotated.sb = blake2b_rot32_S (a.sb);
  rotated.sc = blake2b_rot32_S (a.sc);
  rotated.sd = blake2b_rot32_S (a.sd);
  rotated.se = blake2b_rot32_S (a.se);
  rotated.sf = blake2b_rot32_S (a.sf);
  #endif

  return rotated;
}
/**
 * BLAKE2b compression step (scalar): folds one message block into the
 * chaining value h.
 *
 * h   - 8-word chaining value; seeds v[0..7] and is overwritten with the result
 * m   - message block words; never referenced by name in this body, presumably
 *       consumed inside the BLAKE2B_ROUND expansion (macro is defined outside
 *       this file section - confirm)
 * len - byte counter, packed into t0 through hl32_to_64_S (0, len)
 * f0  - flag word XORed into v[14]; callers in this file pass BLAKE2B_UPDATE
 *       or BLAKE2B_FINAL
 */
DECLSPEC void blake2b_transform (u64 *h, const u64 *m, const int len, const u64 f0)
{
// only one 64-bit counter word is built; the first argument (0) is presumably the high half
const u64 t0 = hl32_to_64_S (0, len);
// 16-word working state: first half is the chaining value, second half the IV constants
u64 v[16];
v[ 0] = h[0];
v[ 1] = h[1];
v[ 2] = h[2];
v[ 3] = h[3];
v[ 4] = h[4];
v[ 5] = h[5];
v[ 6] = h[6];
v[ 7] = h[7];
v[ 8] = BLAKE2B_IV_00;
v[ 9] = BLAKE2B_IV_01;
v[10] = BLAKE2B_IV_02;
v[11] = BLAKE2B_IV_03;
// counter and flag are mixed into v[12] and v[14]; the second counter word (t1)
// and second flag (f1) are intentionally left out, see the trailing comments
v[12] = BLAKE2B_IV_04 ^ t0;
v[13] = BLAKE2B_IV_05; // ^ t1;
v[14] = BLAKE2B_IV_06 ^ f0;
v[15] = BLAKE2B_IV_07; // ^ f1;
// 12 rounds; each argument list is a permutation of 0..15 (the message word order
// for that round). The last two lists repeat the first two.
BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3);
BLAKE2B_ROUND (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4);
BLAKE2B_ROUND ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8);
BLAKE2B_ROUND ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13);
BLAKE2B_ROUND ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9);
BLAKE2B_ROUND (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11);
BLAKE2B_ROUND (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10);
BLAKE2B_ROUND ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5);
BLAKE2B_ROUND (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0);
BLAKE2B_ROUND ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
BLAKE2B_ROUND (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3);
// feed-forward: new h[i] = old h[i] ^ v[i] ^ v[i + 8]
h[0] = h[0] ^ v[0] ^ v[ 8];
h[1] = h[1] ^ v[1] ^ v[ 9];
h[2] = h[2] ^ v[2] ^ v[10];
h[3] = h[3] ^ v[3] ^ v[11];
h[4] = h[4] ^ v[4] ^ v[12];
h[5] = h[5] ^ v[5] ^ v[13];
h[6] = h[6] ^ v[6] ^ v[14];
h[7] = h[7] ^ v[7] ^ v[15];
}
/**
 * Reset a scalar BLAKE2b context to its starting state.
 *
 * The chaining value is loaded from the BLAKE2B_IV_* constants, the message
 * buffer is cleared and the byte counter is set to zero.
 *
 * ctx: context to initialise; every field written here is overwritten
 */
DECLSPEC void blake2b_init (blake2b_ctx_t *ctx)
{
  // nothing buffered, nothing absorbed yet
  ctx->len = 0;

  for (int i = 0; i < 16; i++)
  {
    ctx->m[i] = 0;
  }

  // h[0] additionally carries the parameter word
  ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes

  ctx->h[1] = BLAKE2B_IV_01;
  ctx->h[2] = BLAKE2B_IV_02;
  ctx->h[3] = BLAKE2B_IV_03;
  ctx->h[4] = BLAKE2B_IV_04;
  ctx->h[5] = BLAKE2B_IV_05;
  ctx->h[6] = BLAKE2B_IV_06;
  ctx->h[7] = BLAKE2B_IV_07;
}
/**
 * Absorb up to one 128-byte chunk (w0..w7, four 32-bit words each) into ctx.
 *
 * ctx    - scalar context; h, m and len are updated
 * w0..w7 - input words; handed non-const to the switch_buffer_* helpers on the
 *          unaligned paths, so presumably modified in place - callers should
 *          not rely on their contents afterwards
 * len    - number of input bytes in this chunk; 0 is a no-op
 *
 * A completely filled buffer is not compressed right away: it is compressed
 * at the start of the next call (pos == 0 branch) or by blake2b_final().
 */
DECLSPEC void blake2b_update_128 (blake2b_ctx_t *ctx, u32 *w0, u32 *w1, u32 *w2, u32 *w3, u32 *w4, u32 *w5, u32 *w6, u32 *w7, const int len)
{
if (len == 0) return;
// byte offset inside the 128-byte block buffer
const int pos = ctx->len & 127;
if (pos == 0)
{
// block-aligned: the buffer is either still empty or holds a complete, not yet compressed block
if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform
{
blake2b_transform (ctx->h, ctx->m, ctx->len, BLAKE2B_UPDATE);
}
// the incoming words replace the buffer; each m word is built from a pair of 32-bit
// words, odd index first (presumably the high half - confirm against hl32_to_64_S)
ctx->m[ 0] = hl32_to_64_S (w0[1], w0[0]);
ctx->m[ 1] = hl32_to_64_S (w0[3], w0[2]);
ctx->m[ 2] = hl32_to_64_S (w1[1], w1[0]);
ctx->m[ 3] = hl32_to_64_S (w1[3], w1[2]);
ctx->m[ 4] = hl32_to_64_S (w2[1], w2[0]);
ctx->m[ 5] = hl32_to_64_S (w2[3], w2[2]);
ctx->m[ 6] = hl32_to_64_S (w3[1], w3[0]);
ctx->m[ 7] = hl32_to_64_S (w3[3], w3[2]);
ctx->m[ 8] = hl32_to_64_S (w4[1], w4[0]);
ctx->m[ 9] = hl32_to_64_S (w4[3], w4[2]);
ctx->m[10] = hl32_to_64_S (w5[1], w5[0]);
ctx->m[11] = hl32_to_64_S (w5[3], w5[2]);
ctx->m[12] = hl32_to_64_S (w6[1], w6[0]);
ctx->m[13] = hl32_to_64_S (w6[3], w6[2]);
ctx->m[14] = hl32_to_64_S (w7[1], w7[0]);
ctx->m[15] = hl32_to_64_S (w7[3], w7[2]);
}
else
{
if ((pos + len) <= 128)
{
// the new data still fits: move it to byte offset pos (helper defined elsewhere,
// presumably shifts w0..w7 in place) and OR it into the buffer.
// The OR merge relies on the not yet filled part of m being zero.
switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, pos);
ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]);
ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]);
ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]);
ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]);
ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]);
ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]);
ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]);
ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]);
ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]);
ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]);
ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]);
ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]);
ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]);
ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]);
ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]);
ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]);
}
else
{
// the new data crosses the block boundary: c0..c7 start zeroed and are handed to the
// carry helper, presumably receiving the bytes that do not fit into the current block
u32 c0[4] = { 0 };
u32 c1[4] = { 0 };
u32 c2[4] = { 0 };
u32 c3[4] = { 0 };
u32 c4[4] = { 0 };
u32 c5[4] = { 0 };
u32 c6[4] = { 0 };
u32 c7[4] = { 0 };
switch_buffer_by_offset_8x4_carry_le_S (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos);
// complete the current block with the shifted input
ctx->m[ 0] |= hl32_to_64_S (w0[1], w0[0]);
ctx->m[ 1] |= hl32_to_64_S (w0[3], w0[2]);
ctx->m[ 2] |= hl32_to_64_S (w1[1], w1[0]);
ctx->m[ 3] |= hl32_to_64_S (w1[3], w1[2]);
ctx->m[ 4] |= hl32_to_64_S (w2[1], w2[0]);
ctx->m[ 5] |= hl32_to_64_S (w2[3], w2[2]);
ctx->m[ 6] |= hl32_to_64_S (w3[1], w3[0]);
ctx->m[ 7] |= hl32_to_64_S (w3[3], w3[2]);
ctx->m[ 8] |= hl32_to_64_S (w4[1], w4[0]);
ctx->m[ 9] |= hl32_to_64_S (w4[3], w4[2]);
ctx->m[10] |= hl32_to_64_S (w5[1], w5[0]);
ctx->m[11] |= hl32_to_64_S (w5[3], w5[2]);
ctx->m[12] |= hl32_to_64_S (w6[1], w6[0]);
ctx->m[13] |= hl32_to_64_S (w6[3], w6[2]);
ctx->m[14] |= hl32_to_64_S (w7[1], w7[0]);
ctx->m[15] |= hl32_to_64_S (w7[3], w7[2]);
// len must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE:
// (new total rounded down to a block boundary, i.e. the carried bytes are not counted yet)
const u32 cur_len = ((ctx->len + len) / 128) * 128;
blake2b_transform (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE);
// the carry words become the start of the next block (plain assignment, so the old block is discarded)
ctx->m[ 0] = hl32_to_64_S (c0[1], c0[0]);
ctx->m[ 1] = hl32_to_64_S (c0[3], c0[2]);
ctx->m[ 2] = hl32_to_64_S (c1[1], c1[0]);
ctx->m[ 3] = hl32_to_64_S (c1[3], c1[2]);
ctx->m[ 4] = hl32_to_64_S (c2[1], c2[0]);
ctx->m[ 5] = hl32_to_64_S (c2[3], c2[2]);
ctx->m[ 6] = hl32_to_64_S (c3[1], c3[0]);
ctx->m[ 7] = hl32_to_64_S (c3[3], c3[2]);
ctx->m[ 8] = hl32_to_64_S (c4[1], c4[0]);
ctx->m[ 9] = hl32_to_64_S (c4[3], c4[2]);
ctx->m[10] = hl32_to_64_S (c5[1], c5[0]);
ctx->m[11] = hl32_to_64_S (c5[3], c5[2]);
ctx->m[12] = hl32_to_64_S (c6[1], c6[0]);
ctx->m[13] = hl32_to_64_S (c6[3], c6[2]);
ctx->m[14] = hl32_to_64_S (c7[1], c7[0]);
ctx->m[15] = hl32_to_64_S (c7[3], c7[2]);
}
}
// the counter always advances by the bytes consumed in this call
ctx->len += len;
}
/**
 * Absorb len bytes, given as 32-bit words in w, into a scalar context.
 *
 * The input is cut into 128-byte chunks which are handed one at a time to
 * blake2b_update_128(). The last chunk (1..128 bytes for len > 0) is always
 * passed separately with its real byte count.
 *
 * ctx: context to update
 * w:   input words
 * len: number of input bytes
 *
 * NOTE(review): the last chunk is loaded as a full 32 words even when fewer
 * bytes remain, so w presumably has to be readable (and zero padded) up to
 * the next 128-byte boundary - confirm against the callers.
 */
DECLSPEC void blake2b_update (blake2b_ctx_t *ctx, const u32 *w, const int len)
{
  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];
  u32 w4[4];
  u32 w5[4];
  u32 w6[4];
  u32 w7[4];

  // signed on purpose: goes negative for inputs shorter than one block
  const int full_limit = (const int) len - 128;

  int byte_pos = 0; // offset into the input, in bytes
  int word_pos = 0; // the same offset, in 32-bit words

  // all chunks except the last one
  while (byte_pos < full_limit)
  {
    for (int i = 0; i < 4; i++)
    {
      w0[i] = w[word_pos +  0 + i];
      w1[i] = w[word_pos +  4 + i];
      w2[i] = w[word_pos +  8 + i];
      w3[i] = w[word_pos + 12 + i];
      w4[i] = w[word_pos + 16 + i];
      w5[i] = w[word_pos + 20 + i];
      w6[i] = w[word_pos + 24 + i];
      w7[i] = w[word_pos + 28 + i];
    }

    blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128);

    byte_pos += 128;
    word_pos +=  32;
  }

  // last chunk: same load, but only the remaining byte count is reported
  for (int i = 0; i < 4; i++)
  {
    w0[i] = w[word_pos +  0 + i];
    w1[i] = w[word_pos +  4 + i];
    w2[i] = w[word_pos +  8 + i];
    w3[i] = w[word_pos + 12 + i];
    w4[i] = w[word_pos + 16 + i];
    w5[i] = w[word_pos + 20 + i];
    w6[i] = w[word_pos + 24 + i];
    w7[i] = w[word_pos + 28 + i];
  }

  blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) byte_pos);
}
/**
 * Same as blake2b_update(), but the input pointer carries the GLOBAL_AS
 * address-space qualifier.
 *
 * The input is cut into 128-byte chunks which are copied into local word
 * arrays and handed one at a time to blake2b_update_128(). The last chunk
 * (1..128 bytes for len > 0) is always passed separately with its real byte
 * count.
 *
 * ctx: context to update
 * w:   input words
 * len: number of input bytes
 *
 * NOTE(review): the last chunk is loaded as a full 32 words even when fewer
 * bytes remain, so w presumably has to be readable (and zero padded) up to
 * the next 128-byte boundary - confirm against the callers.
 */
DECLSPEC void blake2b_update_global (blake2b_ctx_t *ctx, GLOBAL_AS const u32 *w, const int len)
{
  u32 w0[4];
  u32 w1[4];
  u32 w2[4];
  u32 w3[4];
  u32 w4[4];
  u32 w5[4];
  u32 w6[4];
  u32 w7[4];

  // signed on purpose: goes negative for inputs shorter than one block
  const int full_limit = (const int) len - 128;

  int byte_pos = 0; // offset into the input, in bytes
  int word_pos = 0; // the same offset, in 32-bit words

  // all chunks except the last one
  while (byte_pos < full_limit)
  {
    for (int i = 0; i < 4; i++)
    {
      w0[i] = w[word_pos +  0 + i];
      w1[i] = w[word_pos +  4 + i];
      w2[i] = w[word_pos +  8 + i];
      w3[i] = w[word_pos + 12 + i];
      w4[i] = w[word_pos + 16 + i];
      w5[i] = w[word_pos + 20 + i];
      w6[i] = w[word_pos + 24 + i];
      w7[i] = w[word_pos + 28 + i];
    }

    blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128);

    byte_pos += 128;
    word_pos +=  32;
  }

  // last chunk: same load, but only the remaining byte count is reported
  for (int i = 0; i < 4; i++)
  {
    w0[i] = w[word_pos +  0 + i];
    w1[i] = w[word_pos +  4 + i];
    w2[i] = w[word_pos +  8 + i];
    w3[i] = w[word_pos + 12 + i];
    w4[i] = w[word_pos + 16 + i];
    w5[i] = w[word_pos + 20 + i];
    w6[i] = w[word_pos + 24 + i];
    w7[i] = w[word_pos + 28 + i];
  }

  blake2b_update_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) byte_pos);
}
/**
 * Finish a scalar BLAKE2b computation.
 *
 * Compresses the block currently held in ctx->m with the BLAKE2B_FINAL flag,
 * using the total byte count as the counter. The result is left in ctx->h.
 */
DECLSPEC void blake2b_final (blake2b_ctx_t *ctx)
{
  const int total_len = ctx->len;

  blake2b_transform (ctx->h, ctx->m, total_len, BLAKE2B_FINAL);
}
/**
 * BLAKE2b compression step (vector): folds one message block into the
 * chaining value h, for all vector lanes at once.
 *
 * h   - 8-word chaining value; seeds v[0..7] and is overwritten with the result
 * m   - message block words; never referenced by name in this body, presumably
 *       consumed inside the BLAKE2B_ROUND_VECTOR expansion (macro is defined
 *       outside this file section - confirm)
 * len - byte counter, packed into t0 through hl32_to_64 (0, len)
 * f0  - scalar flag word XORed into v[14]; callers in this file pass
 *       BLAKE2B_UPDATE or BLAKE2B_FINAL
 */
DECLSPEC void blake2b_transform_vector (u64x *h, const u64x *m, const u32x len, const u64 f0)
{
// only one 64-bit counter word is built; the first argument (0) is presumably the high half
const u64x t0 = hl32_to_64 (0, len);
// 16-word working state: first half is the chaining value, second half the IV constants
u64x v[16];
v[ 0] = h[0];
v[ 1] = h[1];
v[ 2] = h[2];
v[ 3] = h[3];
v[ 4] = h[4];
v[ 5] = h[5];
v[ 6] = h[6];
v[ 7] = h[7];
v[ 8] = BLAKE2B_IV_00;
v[ 9] = BLAKE2B_IV_01;
v[10] = BLAKE2B_IV_02;
v[11] = BLAKE2B_IV_03;
// counter and flag are mixed into v[12] and v[14]; the second counter word (t1)
// and second flag (f1) are intentionally left out, see the trailing comments
v[12] = BLAKE2B_IV_04 ^ t0;
v[13] = BLAKE2B_IV_05; // ^ t1;
v[14] = BLAKE2B_IV_06 ^ f0;
v[15] = BLAKE2B_IV_07; // ^ f1;
// 12 rounds; each argument list is a permutation of 0..15 (the message word order
// for that round). The last two lists repeat the first two.
BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3);
BLAKE2B_ROUND_VECTOR (11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4);
BLAKE2B_ROUND_VECTOR ( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8);
BLAKE2B_ROUND_VECTOR ( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13);
BLAKE2B_ROUND_VECTOR ( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9);
BLAKE2B_ROUND_VECTOR (12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11);
BLAKE2B_ROUND_VECTOR (13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10);
BLAKE2B_ROUND_VECTOR ( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5);
BLAKE2B_ROUND_VECTOR (10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0);
BLAKE2B_ROUND_VECTOR ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
BLAKE2B_ROUND_VECTOR (14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3);
// feed-forward: new h[i] = old h[i] ^ v[i] ^ v[i + 8]
h[0] = h[0] ^ v[0] ^ v[ 8];
h[1] = h[1] ^ v[1] ^ v[ 9];
h[2] = h[2] ^ v[2] ^ v[10];
h[3] = h[3] ^ v[3] ^ v[11];
h[4] = h[4] ^ v[4] ^ v[12];
h[5] = h[5] ^ v[5] ^ v[13];
h[6] = h[6] ^ v[6] ^ v[14];
h[7] = h[7] ^ v[7] ^ v[15];
}
/**
 * Reset a vector BLAKE2b context to its starting state.
 *
 * The chaining value is loaded from the BLAKE2B_IV_* constants, the message
 * buffer is cleared and the byte counter is set to zero.
 *
 * ctx: context to initialise; every field written here is overwritten
 */
DECLSPEC void blake2b_init_vector (blake2b_ctx_vector_t *ctx)
{
  // nothing buffered, nothing absorbed yet
  ctx->len = 0;

  for (int i = 0; i < 16; i++)
  {
    ctx->m[i] = 0;
  }

  // h[0] additionally carries the parameter word
  ctx->h[0] = BLAKE2B_IV_00 ^ 0x01010040; // default output length: 0x40 = 64 bytes

  ctx->h[1] = BLAKE2B_IV_01;
  ctx->h[2] = BLAKE2B_IV_02;
  ctx->h[3] = BLAKE2B_IV_03;
  ctx->h[4] = BLAKE2B_IV_04;
  ctx->h[5] = BLAKE2B_IV_05;
  ctx->h[6] = BLAKE2B_IV_06;
  ctx->h[7] = BLAKE2B_IV_07;
}
/**
 * Absorb up to one 128-byte chunk (w0..w7, four 32-bit vector words each)
 * into a vector context.
 *
 * ctx    - vector context; h, m and len are updated
 * w0..w7 - input words; handed non-const to the switch_buffer_* helpers on the
 *          unaligned paths, so presumably modified in place - callers should
 *          not rely on their contents afterwards
 * len    - number of input bytes in this chunk (one scalar count shared by
 *          all lanes); 0 is a no-op
 *
 * A completely filled buffer is not compressed right away: it is compressed
 * at the start of the next call (pos == 0 branch) or by blake2b_final_vector().
 */
DECLSPEC void blake2b_update_vector_128 (blake2b_ctx_vector_t *ctx, u32x *w0, u32x *w1, u32x *w2, u32x *w3, u32x *w4, u32x *w5, u32x *w6, u32x *w7, const int len)
{
if (len == 0) return;
// byte offset inside the 128-byte block buffer
const int pos = ctx->len & 127;
if (pos == 0)
{
// block-aligned: the buffer is either still empty or holds a complete, not yet compressed block
if (ctx->len > 0) // if new block (pos == 0) AND the (old) len is not zero => transform
{
blake2b_transform_vector (ctx->h, ctx->m, (u32x) ctx->len, BLAKE2B_UPDATE);
}
// the incoming words replace the buffer; each m word is built from a pair of 32-bit
// words, odd index first (presumably the high half - confirm against hl32_to_64)
ctx->m[ 0] = hl32_to_64 (w0[1], w0[0]);
ctx->m[ 1] = hl32_to_64 (w0[3], w0[2]);
ctx->m[ 2] = hl32_to_64 (w1[1], w1[0]);
ctx->m[ 3] = hl32_to_64 (w1[3], w1[2]);
ctx->m[ 4] = hl32_to_64 (w2[1], w2[0]);
ctx->m[ 5] = hl32_to_64 (w2[3], w2[2]);
ctx->m[ 6] = hl32_to_64 (w3[1], w3[0]);
ctx->m[ 7] = hl32_to_64 (w3[3], w3[2]);
ctx->m[ 8] = hl32_to_64 (w4[1], w4[0]);
ctx->m[ 9] = hl32_to_64 (w4[3], w4[2]);
ctx->m[10] = hl32_to_64 (w5[1], w5[0]);
ctx->m[11] = hl32_to_64 (w5[3], w5[2]);
ctx->m[12] = hl32_to_64 (w6[1], w6[0]);
ctx->m[13] = hl32_to_64 (w6[3], w6[2]);
ctx->m[14] = hl32_to_64 (w7[1], w7[0]);
ctx->m[15] = hl32_to_64 (w7[3], w7[2]);
}
else
{
if ((pos + len) <= 128)
{
// the new data still fits: move it to byte offset pos (helper defined elsewhere,
// presumably shifts w0..w7 in place) and OR it into the buffer.
// The OR merge relies on the not yet filled part of m being zero.
switch_buffer_by_offset_8x4_le (w0, w1, w2, w3, w4, w5, w6, w7, pos);
ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]);
ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]);
ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]);
ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]);
ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]);
ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]);
ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]);
ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]);
ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]);
ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]);
ctx->m[10] |= hl32_to_64 (w5[1], w5[0]);
ctx->m[11] |= hl32_to_64 (w5[3], w5[2]);
ctx->m[12] |= hl32_to_64 (w6[1], w6[0]);
ctx->m[13] |= hl32_to_64 (w6[3], w6[2]);
ctx->m[14] |= hl32_to_64 (w7[1], w7[0]);
ctx->m[15] |= hl32_to_64 (w7[3], w7[2]);
}
else
{
// the new data crosses the block boundary: c0..c7 start zeroed and are handed to the
// carry helper, presumably receiving the bytes that do not fit into the current block
u32x c0[4] = { 0 };
u32x c1[4] = { 0 };
u32x c2[4] = { 0 };
u32x c3[4] = { 0 };
u32x c4[4] = { 0 };
u32x c5[4] = { 0 };
u32x c6[4] = { 0 };
u32x c7[4] = { 0 };
switch_buffer_by_offset_8x4_carry_le (w0, w1, w2, w3, w4, w5, w6, w7, c0, c1, c2, c3, c4, c5, c6, c7, pos);
// complete the current block with the shifted input
ctx->m[ 0] |= hl32_to_64 (w0[1], w0[0]);
ctx->m[ 1] |= hl32_to_64 (w0[3], w0[2]);
ctx->m[ 2] |= hl32_to_64 (w1[1], w1[0]);
ctx->m[ 3] |= hl32_to_64 (w1[3], w1[2]);
ctx->m[ 4] |= hl32_to_64 (w2[1], w2[0]);
ctx->m[ 5] |= hl32_to_64 (w2[3], w2[2]);
ctx->m[ 6] |= hl32_to_64 (w3[1], w3[0]);
ctx->m[ 7] |= hl32_to_64 (w3[3], w3[2]);
ctx->m[ 8] |= hl32_to_64 (w4[1], w4[0]);
ctx->m[ 9] |= hl32_to_64 (w4[3], w4[2]);
ctx->m[10] |= hl32_to_64 (w5[1], w5[0]);
ctx->m[11] |= hl32_to_64 (w5[3], w5[2]);
ctx->m[12] |= hl32_to_64 (w6[1], w6[0]);
ctx->m[13] |= hl32_to_64 (w6[3], w6[2]);
ctx->m[14] |= hl32_to_64 (w7[1], w7[0]);
ctx->m[15] |= hl32_to_64 (w7[3], w7[2]);
// len must be a multiple of 128 (not ctx->len) for BLAKE2B_UPDATE:
// (new total rounded down to a block boundary, i.e. the carried bytes are not counted yet)
const u32x cur_len = ((ctx->len + len) / 128) * 128;
blake2b_transform_vector (ctx->h, ctx->m, cur_len, BLAKE2B_UPDATE);
// the carry words become the start of the next block (plain assignment, so the old block is discarded)
ctx->m[ 0] = hl32_to_64 (c0[1], c0[0]);
ctx->m[ 1] = hl32_to_64 (c0[3], c0[2]);
ctx->m[ 2] = hl32_to_64 (c1[1], c1[0]);
ctx->m[ 3] = hl32_to_64 (c1[3], c1[2]);
ctx->m[ 4] = hl32_to_64 (c2[1], c2[0]);
ctx->m[ 5] = hl32_to_64 (c2[3], c2[2]);
ctx->m[ 6] = hl32_to_64 (c3[1], c3[0]);
ctx->m[ 7] = hl32_to_64 (c3[3], c3[2]);
ctx->m[ 8] = hl32_to_64 (c4[1], c4[0]);
ctx->m[ 9] = hl32_to_64 (c4[3], c4[2]);
ctx->m[10] = hl32_to_64 (c5[1], c5[0]);
ctx->m[11] = hl32_to_64 (c5[3], c5[2]);
ctx->m[12] = hl32_to_64 (c6[1], c6[0]);
ctx->m[13] = hl32_to_64 (c6[3], c6[2]);
ctx->m[14] = hl32_to_64 (c7[1], c7[0]);
ctx->m[15] = hl32_to_64 (c7[3], c7[2]);
}
}
// the counter always advances by the bytes consumed in this call
ctx->len += len;
}
/**
 * Absorb len bytes, given as 32-bit vector words in w, into a vector context.
 *
 * The input is cut into 128-byte chunks which are handed one at a time to
 * blake2b_update_vector_128(). The last chunk (1..128 bytes for len > 0) is
 * always passed separately with its real byte count.
 *
 * ctx: context to update
 * w:   input words
 * len: number of input bytes (one scalar count shared by all lanes)
 *
 * NOTE(review): the last chunk is loaded as a full 32 words even when fewer
 * bytes remain, so w presumably has to be readable (and zero padded) up to
 * the next 128-byte boundary - confirm against the callers.
 */
DECLSPEC void blake2b_update_vector (blake2b_ctx_vector_t *ctx, const u32x *w, const int len)
{
  u32x w0[4];
  u32x w1[4];
  u32x w2[4];
  u32x w3[4];
  u32x w4[4];
  u32x w5[4];
  u32x w6[4];
  u32x w7[4];

  // signed on purpose: goes negative for inputs shorter than one block
  const int full_limit = (const int) len - 128;

  int byte_pos = 0; // offset into the input, in bytes
  int word_pos = 0; // the same offset, in 32-bit words

  // all chunks except the last one
  while (byte_pos < full_limit)
  {
    for (int i = 0; i < 4; i++)
    {
      w0[i] = w[word_pos +  0 + i];
      w1[i] = w[word_pos +  4 + i];
      w2[i] = w[word_pos +  8 + i];
      w3[i] = w[word_pos + 12 + i];
      w4[i] = w[word_pos + 16 + i];
      w5[i] = w[word_pos + 20 + i];
      w6[i] = w[word_pos + 24 + i];
      w7[i] = w[word_pos + 28 + i];
    }

    blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128);

    byte_pos += 128;
    word_pos +=  32;
  }

  // last chunk: same load, but only the remaining byte count is reported
  for (int i = 0; i < 4; i++)
  {
    w0[i] = w[word_pos +  0 + i];
    w1[i] = w[word_pos +  4 + i];
    w2[i] = w[word_pos +  8 + i];
    w3[i] = w[word_pos + 12 + i];
    w4[i] = w[word_pos + 16 + i];
    w5[i] = w[word_pos + 20 + i];
    w6[i] = w[word_pos + 24 + i];
    w7[i] = w[word_pos + 28 + i];
  }

  blake2b_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - (u32) byte_pos);
}
/**
 * Finish a vector BLAKE2b computation.
 *
 * Compresses the block currently held in ctx->m with the BLAKE2B_FINAL flag,
 * using the total byte count (cast to u32x) as the counter. The result is
 * left in ctx->h.
 */
DECLSPEC void blake2b_final_vector (blake2b_ctx_vector_t *ctx)
{
  const u32x total_len = (u32x) ctx->len;

  blake2b_transform_vector (ctx->h, ctx->m, total_len, BLAKE2B_FINAL);
}