Rename some hashcat-specific OpenCL functions to avoid conflicts with existing OpenCL functions from the OpenCL runtime

pull/1629/head
Jens Steube 6 years ago
parent 02a2495349
commit a43d3ad176
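
The renames applied throughout this diff map the old hashcat helper names onto hc_-prefixed ones: __add3/__add3_S -> hc_add3/hc_add3_S, __bfe/__bfe_S -> hc_bfe/hc_bfe_S, __byte_perm/__byte_perm_S -> hc_byte_perm/hc_byte_perm_S, and amd_bytealign/amd_bytealign_S -> hc_bytealign/hc_bytealign_S. The double-underscore and amd_* names can clash with identifiers reserved by, or builtins provided by, the OpenCL runtime (for example amd_bytealign from cl_amd_media_ops), so the hashcat-side wrappers get their own prefix. Below is a minimal sketch of the resulting wrapper pattern; the IS_AMD/#else split is simplified for illustration (the real source splits these definitions across several vendor-specific sections), and only the scalar variant is shown:

    #ifdef IS_AMD
    // AMD path: the hc_ wrapper forwards to the cl_amd_media_ops builtin,
    // so the hashcat-side name no longer shadows the runtime function.
    DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
    {
      return amd_bytealign (a, b, c);
    }
    #else
    // Generic path: same interface, implemented without the builtin by
    // concatenating a:b into 64 bits and shifting by (c & 3) bytes.
    DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
    {
      const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

      return (u32) (tmp);
    }
    #endif

Call sites change only the function name, e.g. out0[0] = hc_bytealign_S (in0[1], in0[0], 1); — arguments and semantics stay the same.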


@ -34,20 +34,20 @@
#define MD4_STEP_S(f,a,b,c,d,x,K,s) \
{ \
a += K; \
a = __add3_S (a, x, f (b, c, d)); \
a = hc_add3_S (a, x, f (b, c, d)); \
a = rotl32_S (a, s); \
}
#define MD4_STEP(f,a,b,c,d,x,K,s) \
{ \
a += K; \
a = __add3 (a, x, f (b, c, d)); \
a = hc_add3 (a, x, f (b, c, d)); \
a = rotl32 (a, s); \
}
#define MD4_STEP0(f,a,b,c,d,K,s) \
{ \
a = __add3 (a, K, f (b, c, d)); \
a = hc_add3 (a, K, f (b, c, d)); \
a = rotl32 (a, s); \
}
@ -92,7 +92,7 @@
#define MD5_STEP_S(f,a,b,c,d,x,K,s) \
{ \
a += K; \
a = __add3_S (a, x, f (b, c, d)); \
a = hc_add3_S (a, x, f (b, c, d)); \
a = rotl32_S (a, s); \
a += b; \
}
@ -100,14 +100,14 @@
#define MD5_STEP(f,a,b,c,d,x,K,s) \
{ \
a += K; \
a = __add3 (a, x, f (b, c, d)); \
a = hc_add3 (a, x, f (b, c, d)); \
a = rotl32 (a, s); \
a += b; \
}
#define MD5_STEP0(f,a,b,c,d,K,s) \
{ \
a = __add3 (a, K, f (b, c, d)); \
a = hc_add3 (a, K, f (b, c, d)); \
a = rotl32 (a, s); \
a += b; \
}
@ -139,7 +139,7 @@
#define SHA1_STEP_S(f,a,b,c,d,e,x) \
{ \
e += K; \
e = __add3_S (e, x, f (b, c, d)); \
e = hc_add3_S (e, x, f (b, c, d)); \
e += rotl32_S (a, 5u); \
b = rotl32_S (b, 30u); \
}
@ -147,7 +147,7 @@
#define SHA1_STEP(f,a,b,c,d,e,x) \
{ \
e += K; \
e = __add3 (e, x, f (b, c, d)); \
e = hc_add3 (e, x, f (b, c, d)); \
e += rotl32 (a, 5u); \
b = rotl32 (b, 30u); \
}
@ -155,7 +155,7 @@
/*
#define SHA1_STEP0(f,a,b,c,d,e,x) \
{ \
e = __add3 (e, K, f (b, c, d)); \
e = hc_add3 (e, K, f (b, c, d)); \
e += rotl32 (a, 5u); \
b = rotl32 (b, 30u); \
}
@ -163,7 +163,7 @@
#define SHA1_STEPX(f,a,b,c,d,e,x) \
{ \
e = __add3 (e, x, f (b, c, d)); \
e = hc_add3 (e, x, f (b, c, d)); \
e += rotl32 (a, 5u); \
b = rotl32 (b, 30u); \
}
@ -203,20 +203,20 @@
#define SHA256_STEP_S(F0,F1,a,b,c,d,e,f,g,h,x,K) \
{ \
h = __add3_S (h, K, x); \
h = __add3_S (h, SHA256_S3_S (e), F1 (e,f,g)); \
h = hc_add3_S (h, K, x); \
h = hc_add3_S (h, SHA256_S3_S (e), F1 (e,f,g)); \
d += h; \
h = __add3_S (h, SHA256_S2_S (a), F0 (a,b,c)); \
h = hc_add3_S (h, SHA256_S2_S (a), F0 (a,b,c)); \
}
#define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + y + SHA256_S0_S (z) + w)
#define SHA256_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \
{ \
h = __add3 (h, K, x); \
h = __add3 (h, SHA256_S3 (e), F1 (e,f,g)); \
h = hc_add3 (h, K, x); \
h = hc_add3 (h, SHA256_S3 (e), F1 (e,f,g)); \
d += h; \
h = __add3 (h, SHA256_S2 (a), F0 (a,b,c)); \
h = hc_add3 (h, SHA256_S2 (a), F0 (a,b,c)); \
}
#define SHA256_EXPAND(x,y,z,w) (SHA256_S1 (x) + y + SHA256_S0 (z) + w)

@ -136,26 +136,26 @@ void truncate_left (u32 *buf0, u32 *buf1, const u32 offset)
void lshift_block (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1)
{
out0[0] = amd_bytealign_S (in0[1], in0[0], 1);
out0[1] = amd_bytealign_S (in0[2], in0[1], 1);
out0[2] = amd_bytealign_S (in0[3], in0[2], 1);
out0[3] = amd_bytealign_S (in1[0], in0[3], 1);
out1[0] = amd_bytealign_S (in1[1], in1[0], 1);
out1[1] = amd_bytealign_S (in1[2], in1[1], 1);
out1[2] = amd_bytealign_S (in1[3], in1[2], 1);
out1[3] = amd_bytealign_S ( 0, in1[3], 1);
out0[0] = hc_bytealign_S (in0[1], in0[0], 1);
out0[1] = hc_bytealign_S (in0[2], in0[1], 1);
out0[2] = hc_bytealign_S (in0[3], in0[2], 1);
out0[3] = hc_bytealign_S (in1[0], in0[3], 1);
out1[0] = hc_bytealign_S (in1[1], in1[0], 1);
out1[1] = hc_bytealign_S (in1[2], in1[1], 1);
out1[2] = hc_bytealign_S (in1[3], in1[2], 1);
out1[3] = hc_bytealign_S ( 0, in1[3], 1);
}
void rshift_block (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1)
{
out1[3] = amd_bytealign_S (in1[3], in1[2], 3);
out1[2] = amd_bytealign_S (in1[2], in1[1], 3);
out1[1] = amd_bytealign_S (in1[1], in1[0], 3);
out1[0] = amd_bytealign_S (in1[0], in0[3], 3);
out0[3] = amd_bytealign_S (in0[3], in0[2], 3);
out0[2] = amd_bytealign_S (in0[2], in0[1], 3);
out0[1] = amd_bytealign_S (in0[1], in0[0], 3);
out0[0] = amd_bytealign_S (in0[0], 0, 3);
out1[3] = hc_bytealign_S (in1[3], in1[2], 3);
out1[2] = hc_bytealign_S (in1[2], in1[1], 3);
out1[1] = hc_bytealign_S (in1[1], in1[0], 3);
out1[0] = hc_bytealign_S (in1[0], in0[3], 3);
out0[3] = hc_bytealign_S (in0[3], in0[2], 3);
out0[2] = hc_bytealign_S (in0[2], in0[1], 3);
out0[1] = hc_bytealign_S (in0[1], in0[0], 3);
out0[0] = hc_bytealign_S (in0[0], 0, 3);
}
void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const u32 num)
@ -171,32 +171,32 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = in1[2];
out1[3] = in1[3];
break;
case 1: out0[0] = amd_bytealign_S (in0[1], in0[0], 1);
out0[1] = amd_bytealign_S (in0[2], in0[1], 1);
out0[2] = amd_bytealign_S (in0[3], in0[2], 1);
out0[3] = amd_bytealign_S (in1[0], in0[3], 1);
out1[0] = amd_bytealign_S (in1[1], in1[0], 1);
out1[1] = amd_bytealign_S (in1[2], in1[1], 1);
out1[2] = amd_bytealign_S (in1[3], in1[2], 1);
out1[3] = amd_bytealign_S ( 0, in1[3], 1);
break;
case 2: out0[0] = amd_bytealign_S (in0[1], in0[0], 2);
out0[1] = amd_bytealign_S (in0[2], in0[1], 2);
out0[2] = amd_bytealign_S (in0[3], in0[2], 2);
out0[3] = amd_bytealign_S (in1[0], in0[3], 2);
out1[0] = amd_bytealign_S (in1[1], in1[0], 2);
out1[1] = amd_bytealign_S (in1[2], in1[1], 2);
out1[2] = amd_bytealign_S (in1[3], in1[2], 2);
out1[3] = amd_bytealign_S ( 0, in1[3], 2);
break;
case 3: out0[0] = amd_bytealign_S (in0[1], in0[0], 3);
out0[1] = amd_bytealign_S (in0[2], in0[1], 3);
out0[2] = amd_bytealign_S (in0[3], in0[2], 3);
out0[3] = amd_bytealign_S (in1[0], in0[3], 3);
out1[0] = amd_bytealign_S (in1[1], in1[0], 3);
out1[1] = amd_bytealign_S (in1[2], in1[1], 3);
out1[2] = amd_bytealign_S (in1[3], in1[2], 3);
out1[3] = amd_bytealign_S ( 0, in1[3], 3);
case 1: out0[0] = hc_bytealign_S (in0[1], in0[0], 1);
out0[1] = hc_bytealign_S (in0[2], in0[1], 1);
out0[2] = hc_bytealign_S (in0[3], in0[2], 1);
out0[3] = hc_bytealign_S (in1[0], in0[3], 1);
out1[0] = hc_bytealign_S (in1[1], in1[0], 1);
out1[1] = hc_bytealign_S (in1[2], in1[1], 1);
out1[2] = hc_bytealign_S (in1[3], in1[2], 1);
out1[3] = hc_bytealign_S ( 0, in1[3], 1);
break;
case 2: out0[0] = hc_bytealign_S (in0[1], in0[0], 2);
out0[1] = hc_bytealign_S (in0[2], in0[1], 2);
out0[2] = hc_bytealign_S (in0[3], in0[2], 2);
out0[3] = hc_bytealign_S (in1[0], in0[3], 2);
out1[0] = hc_bytealign_S (in1[1], in1[0], 2);
out1[1] = hc_bytealign_S (in1[2], in1[1], 2);
out1[2] = hc_bytealign_S (in1[3], in1[2], 2);
out1[3] = hc_bytealign_S ( 0, in1[3], 2);
break;
case 3: out0[0] = hc_bytealign_S (in0[1], in0[0], 3);
out0[1] = hc_bytealign_S (in0[2], in0[1], 3);
out0[2] = hc_bytealign_S (in0[3], in0[2], 3);
out0[3] = hc_bytealign_S (in1[0], in0[3], 3);
out1[0] = hc_bytealign_S (in1[1], in1[0], 3);
out1[1] = hc_bytealign_S (in1[2], in1[1], 3);
out1[2] = hc_bytealign_S (in1[3], in1[2], 3);
out1[3] = hc_bytealign_S ( 0, in1[3], 3);
break;
case 4: out0[0] = in0[1];
out0[1] = in0[2];
@ -207,31 +207,31 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = in1[3];
out1[3] = 0;
break;
case 5: out0[0] = amd_bytealign_S (in0[2], in0[1], 1);
out0[1] = amd_bytealign_S (in0[3], in0[2], 1);
out0[2] = amd_bytealign_S (in1[0], in0[3], 1);
out0[3] = amd_bytealign_S (in1[1], in1[0], 1);
out1[0] = amd_bytealign_S (in1[2], in1[1], 1);
out1[1] = amd_bytealign_S (in1[3], in1[2], 1);
out1[2] = amd_bytealign_S ( 0, in1[3], 1);
case 5: out0[0] = hc_bytealign_S (in0[2], in0[1], 1);
out0[1] = hc_bytealign_S (in0[3], in0[2], 1);
out0[2] = hc_bytealign_S (in1[0], in0[3], 1);
out0[3] = hc_bytealign_S (in1[1], in1[0], 1);
out1[0] = hc_bytealign_S (in1[2], in1[1], 1);
out1[1] = hc_bytealign_S (in1[3], in1[2], 1);
out1[2] = hc_bytealign_S ( 0, in1[3], 1);
out1[3] = 0;
break;
case 6: out0[0] = amd_bytealign_S (in0[2], in0[1], 2);
out0[1] = amd_bytealign_S (in0[3], in0[2], 2);
out0[2] = amd_bytealign_S (in1[0], in0[3], 2);
out0[3] = amd_bytealign_S (in1[1], in1[0], 2);
out1[0] = amd_bytealign_S (in1[2], in1[1], 2);
out1[1] = amd_bytealign_S (in1[3], in1[2], 2);
out1[2] = amd_bytealign_S ( 0, in1[3], 2);
case 6: out0[0] = hc_bytealign_S (in0[2], in0[1], 2);
out0[1] = hc_bytealign_S (in0[3], in0[2], 2);
out0[2] = hc_bytealign_S (in1[0], in0[3], 2);
out0[3] = hc_bytealign_S (in1[1], in1[0], 2);
out1[0] = hc_bytealign_S (in1[2], in1[1], 2);
out1[1] = hc_bytealign_S (in1[3], in1[2], 2);
out1[2] = hc_bytealign_S ( 0, in1[3], 2);
out1[3] = 0;
break;
case 7: out0[0] = amd_bytealign_S (in0[2], in0[1], 3);
out0[1] = amd_bytealign_S (in0[3], in0[2], 3);
out0[2] = amd_bytealign_S (in1[0], in0[3], 3);
out0[3] = amd_bytealign_S (in1[1], in1[0], 3);
out1[0] = amd_bytealign_S (in1[2], in1[1], 3);
out1[1] = amd_bytealign_S (in1[3], in1[2], 3);
out1[2] = amd_bytealign_S ( 0, in1[3], 3);
case 7: out0[0] = hc_bytealign_S (in0[2], in0[1], 3);
out0[1] = hc_bytealign_S (in0[3], in0[2], 3);
out0[2] = hc_bytealign_S (in1[0], in0[3], 3);
out0[3] = hc_bytealign_S (in1[1], in1[0], 3);
out1[0] = hc_bytealign_S (in1[2], in1[1], 3);
out1[1] = hc_bytealign_S (in1[3], in1[2], 3);
out1[2] = hc_bytealign_S ( 0, in1[3], 3);
out1[3] = 0;
break;
case 8: out0[0] = in0[2];
@ -243,30 +243,30 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 9: out0[0] = amd_bytealign_S (in0[3], in0[2], 1);
out0[1] = amd_bytealign_S (in1[0], in0[3], 1);
out0[2] = amd_bytealign_S (in1[1], in1[0], 1);
out0[3] = amd_bytealign_S (in1[2], in1[1], 1);
out1[0] = amd_bytealign_S (in1[3], in1[2], 1);
out1[1] = amd_bytealign_S ( 0, in1[3], 1);
case 9: out0[0] = hc_bytealign_S (in0[3], in0[2], 1);
out0[1] = hc_bytealign_S (in1[0], in0[3], 1);
out0[2] = hc_bytealign_S (in1[1], in1[0], 1);
out0[3] = hc_bytealign_S (in1[2], in1[1], 1);
out1[0] = hc_bytealign_S (in1[3], in1[2], 1);
out1[1] = hc_bytealign_S ( 0, in1[3], 1);
out1[2] = 0;
out1[3] = 0;
break;
case 10: out0[0] = amd_bytealign_S (in0[3], in0[2], 2);
out0[1] = amd_bytealign_S (in1[0], in0[3], 2);
out0[2] = amd_bytealign_S (in1[1], in1[0], 2);
out0[3] = amd_bytealign_S (in1[2], in1[1], 2);
out1[0] = amd_bytealign_S (in1[3], in1[2], 2);
out1[1] = amd_bytealign_S ( 0, in1[3], 2);
case 10: out0[0] = hc_bytealign_S (in0[3], in0[2], 2);
out0[1] = hc_bytealign_S (in1[0], in0[3], 2);
out0[2] = hc_bytealign_S (in1[1], in1[0], 2);
out0[3] = hc_bytealign_S (in1[2], in1[1], 2);
out1[0] = hc_bytealign_S (in1[3], in1[2], 2);
out1[1] = hc_bytealign_S ( 0, in1[3], 2);
out1[2] = 0;
out1[3] = 0;
break;
case 11: out0[0] = amd_bytealign_S (in0[3], in0[2], 3);
out0[1] = amd_bytealign_S (in1[0], in0[3], 3);
out0[2] = amd_bytealign_S (in1[1], in1[0], 3);
out0[3] = amd_bytealign_S (in1[2], in1[1], 3);
out1[0] = amd_bytealign_S (in1[3], in1[2], 3);
out1[1] = amd_bytealign_S ( 0, in1[3], 3);
case 11: out0[0] = hc_bytealign_S (in0[3], in0[2], 3);
out0[1] = hc_bytealign_S (in1[0], in0[3], 3);
out0[2] = hc_bytealign_S (in1[1], in1[0], 3);
out0[3] = hc_bytealign_S (in1[2], in1[1], 3);
out1[0] = hc_bytealign_S (in1[3], in1[2], 3);
out1[1] = hc_bytealign_S ( 0, in1[3], 3);
out1[2] = 0;
out1[3] = 0;
break;
@ -279,29 +279,29 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 13: out0[0] = amd_bytealign_S (in1[0], in0[3], 1);
out0[1] = amd_bytealign_S (in1[1], in1[0], 1);
out0[2] = amd_bytealign_S (in1[2], in1[1], 1);
out0[3] = amd_bytealign_S (in1[3], in1[2], 1);
out1[0] = amd_bytealign_S ( 0, in1[3], 1);
case 13: out0[0] = hc_bytealign_S (in1[0], in0[3], 1);
out0[1] = hc_bytealign_S (in1[1], in1[0], 1);
out0[2] = hc_bytealign_S (in1[2], in1[1], 1);
out0[3] = hc_bytealign_S (in1[3], in1[2], 1);
out1[0] = hc_bytealign_S ( 0, in1[3], 1);
out1[1] = 0;
out1[2] = 0;
out1[3] = 0;
break;
case 14: out0[0] = amd_bytealign_S (in1[0], in0[3], 2);
out0[1] = amd_bytealign_S (in1[1], in1[0], 2);
out0[2] = amd_bytealign_S (in1[2], in1[1], 2);
out0[3] = amd_bytealign_S (in1[3], in1[2], 2);
out1[0] = amd_bytealign_S ( 0, in1[3], 2);
case 14: out0[0] = hc_bytealign_S (in1[0], in0[3], 2);
out0[1] = hc_bytealign_S (in1[1], in1[0], 2);
out0[2] = hc_bytealign_S (in1[2], in1[1], 2);
out0[3] = hc_bytealign_S (in1[3], in1[2], 2);
out1[0] = hc_bytealign_S ( 0, in1[3], 2);
out1[1] = 0;
out1[2] = 0;
out1[3] = 0;
break;
case 15: out0[0] = amd_bytealign_S (in1[0], in0[3], 3);
out0[1] = amd_bytealign_S (in1[1], in1[0], 3);
out0[2] = amd_bytealign_S (in1[2], in1[1], 3);
out0[3] = amd_bytealign_S (in1[3], in1[2], 3);
out1[0] = amd_bytealign_S ( 0, in1[3], 3);
case 15: out0[0] = hc_bytealign_S (in1[0], in0[3], 3);
out0[1] = hc_bytealign_S (in1[1], in1[0], 3);
out0[2] = hc_bytealign_S (in1[2], in1[1], 3);
out0[3] = hc_bytealign_S (in1[3], in1[2], 3);
out1[0] = hc_bytealign_S ( 0, in1[3], 3);
out1[1] = 0;
out1[2] = 0;
out1[3] = 0;
@ -315,28 +315,28 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 17: out0[0] = amd_bytealign_S (in1[1], in1[0], 1);
out0[1] = amd_bytealign_S (in1[2], in1[1], 1);
out0[2] = amd_bytealign_S (in1[3], in1[2], 1);
out0[3] = amd_bytealign_S ( 0, in1[3], 1);
case 17: out0[0] = hc_bytealign_S (in1[1], in1[0], 1);
out0[1] = hc_bytealign_S (in1[2], in1[1], 1);
out0[2] = hc_bytealign_S (in1[3], in1[2], 1);
out0[3] = hc_bytealign_S ( 0, in1[3], 1);
out1[0] = 0;
out1[1] = 0;
out1[2] = 0;
out1[3] = 0;
break;
case 18: out0[0] = amd_bytealign_S (in1[1], in1[0], 2);
out0[1] = amd_bytealign_S (in1[2], in1[1], 2);
out0[2] = amd_bytealign_S (in1[3], in1[2], 2);
out0[3] = amd_bytealign_S ( 0, in1[3], 2);
case 18: out0[0] = hc_bytealign_S (in1[1], in1[0], 2);
out0[1] = hc_bytealign_S (in1[2], in1[1], 2);
out0[2] = hc_bytealign_S (in1[3], in1[2], 2);
out0[3] = hc_bytealign_S ( 0, in1[3], 2);
out1[0] = 0;
out1[1] = 0;
out1[2] = 0;
out1[3] = 0;
break;
case 19: out0[0] = amd_bytealign_S (in1[1], in1[0], 3);
out0[1] = amd_bytealign_S (in1[2], in1[1], 3);
out0[2] = amd_bytealign_S (in1[3], in1[2], 3);
out0[3] = amd_bytealign_S ( 0, in1[3], 3);
case 19: out0[0] = hc_bytealign_S (in1[1], in1[0], 3);
out0[1] = hc_bytealign_S (in1[2], in1[1], 3);
out0[2] = hc_bytealign_S (in1[3], in1[2], 3);
out0[3] = hc_bytealign_S ( 0, in1[3], 3);
out1[0] = 0;
out1[1] = 0;
out1[2] = 0;
@ -351,27 +351,27 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 21: out0[0] = amd_bytealign_S (in1[2], in1[1], 1);
out0[1] = amd_bytealign_S (in1[3], in1[2], 1);
out0[2] = amd_bytealign_S ( 0, in1[3], 1);
case 21: out0[0] = hc_bytealign_S (in1[2], in1[1], 1);
out0[1] = hc_bytealign_S (in1[3], in1[2], 1);
out0[2] = hc_bytealign_S ( 0, in1[3], 1);
out0[3] = 0;
out1[0] = 0;
out1[1] = 0;
out1[2] = 0;
out1[3] = 0;
break;
case 22: out0[0] = amd_bytealign_S (in1[2], in1[1], 2);
out0[1] = amd_bytealign_S (in1[3], in1[2], 2);
out0[2] = amd_bytealign_S ( 0, in1[3], 2);
case 22: out0[0] = hc_bytealign_S (in1[2], in1[1], 2);
out0[1] = hc_bytealign_S (in1[3], in1[2], 2);
out0[2] = hc_bytealign_S ( 0, in1[3], 2);
out0[3] = 0;
out1[0] = 0;
out1[1] = 0;
out1[2] = 0;
out1[3] = 0;
break;
case 23: out0[0] = amd_bytealign_S (in1[2], in1[1], 3);
out0[1] = amd_bytealign_S (in1[3], in1[2], 3);
out0[2] = amd_bytealign_S ( 0, in1[3], 3);
case 23: out0[0] = hc_bytealign_S (in1[2], in1[1], 3);
out0[1] = hc_bytealign_S (in1[3], in1[2], 3);
out0[2] = hc_bytealign_S ( 0, in1[3], 3);
out0[3] = 0;
out1[0] = 0;
out1[1] = 0;
@ -387,8 +387,8 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 25: out0[0] = amd_bytealign_S (in1[3], in1[2], 1);
out0[1] = amd_bytealign_S ( 0, in1[3], 1);
case 25: out0[0] = hc_bytealign_S (in1[3], in1[2], 1);
out0[1] = hc_bytealign_S ( 0, in1[3], 1);
out0[2] = 0;
out0[3] = 0;
out1[0] = 0;
@ -396,8 +396,8 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 26: out0[0] = amd_bytealign_S (in1[3], in1[2], 2);
out0[1] = amd_bytealign_S ( 0, in1[3], 2);
case 26: out0[0] = hc_bytealign_S (in1[3], in1[2], 2);
out0[1] = hc_bytealign_S ( 0, in1[3], 2);
out0[2] = 0;
out0[3] = 0;
out1[0] = 0;
@ -405,8 +405,8 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 27: out0[0] = amd_bytealign_S (in1[3], in1[2], 3);
out0[1] = amd_bytealign_S ( 0, in1[3], 3);
case 27: out0[0] = hc_bytealign_S (in1[3], in1[2], 3);
out0[1] = hc_bytealign_S ( 0, in1[3], 3);
out0[2] = 0;
out0[3] = 0;
out1[0] = 0;
@ -423,7 +423,7 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 29: out0[0] = amd_bytealign_S ( 0, in1[3], 1);
case 29: out0[0] = hc_bytealign_S ( 0, in1[3], 1);
out0[1] = 0;
out0[2] = 0;
out0[3] = 0;
@ -432,7 +432,7 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 30: out0[0] = amd_bytealign_S ( 0, in1[3], 2);
case 30: out0[0] = hc_bytealign_S ( 0, in1[3], 2);
out0[1] = 0;
out0[2] = 0;
out0[3] = 0;
@ -441,7 +441,7 @@ void lshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out1[2] = 0;
out1[3] = 0;
break;
case 31: out0[0] = amd_bytealign_S ( 0, in1[3], 3);
case 31: out0[0] = hc_bytealign_S ( 0, in1[3], 3);
out0[1] = 0;
out0[2] = 0;
out0[3] = 0;
@ -466,32 +466,32 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = in0[1];
out0[0] = in0[0];
break;
case 1: out1[3] = amd_bytealign_S (in1[3], in1[2], 3);
out1[2] = amd_bytealign_S (in1[2], in1[1], 3);
out1[1] = amd_bytealign_S (in1[1], in1[0], 3);
out1[0] = amd_bytealign_S (in1[0], in0[3], 3);
out0[3] = amd_bytealign_S (in0[3], in0[2], 3);
out0[2] = amd_bytealign_S (in0[2], in0[1], 3);
out0[1] = amd_bytealign_S (in0[1], in0[0], 3);
out0[0] = amd_bytealign_S (in0[0], 0, 3);
break;
case 2: out1[3] = amd_bytealign_S (in1[3], in1[2], 2);
out1[2] = amd_bytealign_S (in1[2], in1[1], 2);
out1[1] = amd_bytealign_S (in1[1], in1[0], 2);
out1[0] = amd_bytealign_S (in1[0], in0[3], 2);
out0[3] = amd_bytealign_S (in0[3], in0[2], 2);
out0[2] = amd_bytealign_S (in0[2], in0[1], 2);
out0[1] = amd_bytealign_S (in0[1], in0[0], 2);
out0[0] = amd_bytealign_S (in0[0], 0, 2);
break;
case 3: out1[3] = amd_bytealign_S (in1[3], in1[2], 1);
out1[2] = amd_bytealign_S (in1[2], in1[1], 1);
out1[1] = amd_bytealign_S (in1[1], in1[0], 1);
out1[0] = amd_bytealign_S (in1[0], in0[3], 1);
out0[3] = amd_bytealign_S (in0[3], in0[2], 1);
out0[2] = amd_bytealign_S (in0[2], in0[1], 1);
out0[1] = amd_bytealign_S (in0[1], in0[0], 1);
out0[0] = amd_bytealign_S (in0[0], 0, 1);
case 1: out1[3] = hc_bytealign_S (in1[3], in1[2], 3);
out1[2] = hc_bytealign_S (in1[2], in1[1], 3);
out1[1] = hc_bytealign_S (in1[1], in1[0], 3);
out1[0] = hc_bytealign_S (in1[0], in0[3], 3);
out0[3] = hc_bytealign_S (in0[3], in0[2], 3);
out0[2] = hc_bytealign_S (in0[2], in0[1], 3);
out0[1] = hc_bytealign_S (in0[1], in0[0], 3);
out0[0] = hc_bytealign_S (in0[0], 0, 3);
break;
case 2: out1[3] = hc_bytealign_S (in1[3], in1[2], 2);
out1[2] = hc_bytealign_S (in1[2], in1[1], 2);
out1[1] = hc_bytealign_S (in1[1], in1[0], 2);
out1[0] = hc_bytealign_S (in1[0], in0[3], 2);
out0[3] = hc_bytealign_S (in0[3], in0[2], 2);
out0[2] = hc_bytealign_S (in0[2], in0[1], 2);
out0[1] = hc_bytealign_S (in0[1], in0[0], 2);
out0[0] = hc_bytealign_S (in0[0], 0, 2);
break;
case 3: out1[3] = hc_bytealign_S (in1[3], in1[2], 1);
out1[2] = hc_bytealign_S (in1[2], in1[1], 1);
out1[1] = hc_bytealign_S (in1[1], in1[0], 1);
out1[0] = hc_bytealign_S (in1[0], in0[3], 1);
out0[3] = hc_bytealign_S (in0[3], in0[2], 1);
out0[2] = hc_bytealign_S (in0[2], in0[1], 1);
out0[1] = hc_bytealign_S (in0[1], in0[0], 1);
out0[0] = hc_bytealign_S (in0[0], 0, 1);
break;
case 4: out1[3] = in1[2];
out1[2] = in1[1];
@ -502,31 +502,31 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = in0[0];
out0[0] = 0;
break;
case 5: out1[3] = amd_bytealign_S (in1[2], in1[1], 3);
out1[2] = amd_bytealign_S (in1[1], in1[0], 3);
out1[1] = amd_bytealign_S (in1[0], in0[3], 3);
out1[0] = amd_bytealign_S (in0[3], in0[2], 3);
out0[3] = amd_bytealign_S (in0[2], in0[1], 3);
out0[2] = amd_bytealign_S (in0[1], in0[0], 3);
out0[1] = amd_bytealign_S (in0[0], 0, 3);
case 5: out1[3] = hc_bytealign_S (in1[2], in1[1], 3);
out1[2] = hc_bytealign_S (in1[1], in1[0], 3);
out1[1] = hc_bytealign_S (in1[0], in0[3], 3);
out1[0] = hc_bytealign_S (in0[3], in0[2], 3);
out0[3] = hc_bytealign_S (in0[2], in0[1], 3);
out0[2] = hc_bytealign_S (in0[1], in0[0], 3);
out0[1] = hc_bytealign_S (in0[0], 0, 3);
out0[0] = 0;
break;
case 6: out1[3] = amd_bytealign_S (in1[2], in1[1], 2);
out1[2] = amd_bytealign_S (in1[1], in1[0], 2);
out1[1] = amd_bytealign_S (in1[0], in0[3], 2);
out1[0] = amd_bytealign_S (in0[3], in0[2], 2);
out0[3] = amd_bytealign_S (in0[2], in0[1], 2);
out0[2] = amd_bytealign_S (in0[1], in0[0], 2);
out0[1] = amd_bytealign_S (in0[0], 0, 2);
case 6: out1[3] = hc_bytealign_S (in1[2], in1[1], 2);
out1[2] = hc_bytealign_S (in1[1], in1[0], 2);
out1[1] = hc_bytealign_S (in1[0], in0[3], 2);
out1[0] = hc_bytealign_S (in0[3], in0[2], 2);
out0[3] = hc_bytealign_S (in0[2], in0[1], 2);
out0[2] = hc_bytealign_S (in0[1], in0[0], 2);
out0[1] = hc_bytealign_S (in0[0], 0, 2);
out0[0] = 0;
break;
case 7: out1[3] = amd_bytealign_S (in1[2], in1[1], 1);
out1[2] = amd_bytealign_S (in1[1], in1[0], 1);
out1[1] = amd_bytealign_S (in1[0], in0[3], 1);
out1[0] = amd_bytealign_S (in0[3], in0[2], 1);
out0[3] = amd_bytealign_S (in0[2], in0[1], 1);
out0[2] = amd_bytealign_S (in0[1], in0[0], 1);
out0[1] = amd_bytealign_S (in0[0], 0, 1);
case 7: out1[3] = hc_bytealign_S (in1[2], in1[1], 1);
out1[2] = hc_bytealign_S (in1[1], in1[0], 1);
out1[1] = hc_bytealign_S (in1[0], in0[3], 1);
out1[0] = hc_bytealign_S (in0[3], in0[2], 1);
out0[3] = hc_bytealign_S (in0[2], in0[1], 1);
out0[2] = hc_bytealign_S (in0[1], in0[0], 1);
out0[1] = hc_bytealign_S (in0[0], 0, 1);
out0[0] = 0;
break;
case 8: out1[3] = in1[1];
@ -538,30 +538,30 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 9: out1[3] = amd_bytealign_S (in1[1], in1[0], 3);
out1[2] = amd_bytealign_S (in1[0], in0[3], 3);
out1[1] = amd_bytealign_S (in0[3], in0[2], 3);
out1[0] = amd_bytealign_S (in0[2], in0[1], 3);
out0[3] = amd_bytealign_S (in0[1], in0[0], 3);
out0[2] = amd_bytealign_S (in0[0], 0, 3);
case 9: out1[3] = hc_bytealign_S (in1[1], in1[0], 3);
out1[2] = hc_bytealign_S (in1[0], in0[3], 3);
out1[1] = hc_bytealign_S (in0[3], in0[2], 3);
out1[0] = hc_bytealign_S (in0[2], in0[1], 3);
out0[3] = hc_bytealign_S (in0[1], in0[0], 3);
out0[2] = hc_bytealign_S (in0[0], 0, 3);
out0[1] = 0;
out0[0] = 0;
break;
case 10: out1[3] = amd_bytealign_S (in1[1], in1[0], 2);
out1[2] = amd_bytealign_S (in1[0], in0[3], 2);
out1[1] = amd_bytealign_S (in0[3], in0[2], 2);
out1[0] = amd_bytealign_S (in0[2], in0[1], 2);
out0[3] = amd_bytealign_S (in0[1], in0[0], 2);
out0[2] = amd_bytealign_S (in0[0], 0, 2);
case 10: out1[3] = hc_bytealign_S (in1[1], in1[0], 2);
out1[2] = hc_bytealign_S (in1[0], in0[3], 2);
out1[1] = hc_bytealign_S (in0[3], in0[2], 2);
out1[0] = hc_bytealign_S (in0[2], in0[1], 2);
out0[3] = hc_bytealign_S (in0[1], in0[0], 2);
out0[2] = hc_bytealign_S (in0[0], 0, 2);
out0[1] = 0;
out0[0] = 0;
break;
case 11: out1[3] = amd_bytealign_S (in1[1], in1[0], 1);
out1[2] = amd_bytealign_S (in1[0], in0[3], 1);
out1[1] = amd_bytealign_S (in0[3], in0[2], 1);
out1[0] = amd_bytealign_S (in0[2], in0[1], 1);
out0[3] = amd_bytealign_S (in0[1], in0[0], 1);
out0[2] = amd_bytealign_S (in0[0], 0, 1);
case 11: out1[3] = hc_bytealign_S (in1[1], in1[0], 1);
out1[2] = hc_bytealign_S (in1[0], in0[3], 1);
out1[1] = hc_bytealign_S (in0[3], in0[2], 1);
out1[0] = hc_bytealign_S (in0[2], in0[1], 1);
out0[3] = hc_bytealign_S (in0[1], in0[0], 1);
out0[2] = hc_bytealign_S (in0[0], 0, 1);
out0[1] = 0;
out0[0] = 0;
break;
@ -574,29 +574,29 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 13: out1[3] = amd_bytealign_S (in1[0], in0[3], 3);
out1[2] = amd_bytealign_S (in0[3], in0[2], 3);
out1[1] = amd_bytealign_S (in0[2], in0[1], 3);
out1[0] = amd_bytealign_S (in0[1], in0[0], 3);
out0[3] = amd_bytealign_S (in0[0], 0, 3);
case 13: out1[3] = hc_bytealign_S (in1[0], in0[3], 3);
out1[2] = hc_bytealign_S (in0[3], in0[2], 3);
out1[1] = hc_bytealign_S (in0[2], in0[1], 3);
out1[0] = hc_bytealign_S (in0[1], in0[0], 3);
out0[3] = hc_bytealign_S (in0[0], 0, 3);
out0[2] = 0;
out0[1] = 0;
out0[0] = 0;
break;
case 14: out1[3] = amd_bytealign_S (in1[0], in0[3], 2);
out1[2] = amd_bytealign_S (in0[3], in0[2], 2);
out1[1] = amd_bytealign_S (in0[2], in0[1], 2);
out1[0] = amd_bytealign_S (in0[1], in0[0], 2);
out0[3] = amd_bytealign_S (in0[0], 0, 2);
case 14: out1[3] = hc_bytealign_S (in1[0], in0[3], 2);
out1[2] = hc_bytealign_S (in0[3], in0[2], 2);
out1[1] = hc_bytealign_S (in0[2], in0[1], 2);
out1[0] = hc_bytealign_S (in0[1], in0[0], 2);
out0[3] = hc_bytealign_S (in0[0], 0, 2);
out0[2] = 0;
out0[1] = 0;
out0[0] = 0;
break;
case 15: out1[3] = amd_bytealign_S (in1[0], in0[3], 1);
out1[2] = amd_bytealign_S (in0[3], in0[2], 1);
out1[1] = amd_bytealign_S (in0[2], in0[1], 1);
out1[0] = amd_bytealign_S (in0[1], in0[0], 1);
out0[3] = amd_bytealign_S (in0[0], 0, 1);
case 15: out1[3] = hc_bytealign_S (in1[0], in0[3], 1);
out1[2] = hc_bytealign_S (in0[3], in0[2], 1);
out1[1] = hc_bytealign_S (in0[2], in0[1], 1);
out1[0] = hc_bytealign_S (in0[1], in0[0], 1);
out0[3] = hc_bytealign_S (in0[0], 0, 1);
out0[2] = 0;
out0[1] = 0;
out0[0] = 0;
@ -610,28 +610,28 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 17: out1[3] = amd_bytealign_S (in0[3], in0[2], 3);
out1[2] = amd_bytealign_S (in0[2], in0[1], 3);
out1[1] = amd_bytealign_S (in0[1], in0[0], 3);
out1[0] = amd_bytealign_S (in0[0], 0, 3);
case 17: out1[3] = hc_bytealign_S (in0[3], in0[2], 3);
out1[2] = hc_bytealign_S (in0[2], in0[1], 3);
out1[1] = hc_bytealign_S (in0[1], in0[0], 3);
out1[0] = hc_bytealign_S (in0[0], 0, 3);
out0[3] = 0;
out0[2] = 0;
out0[1] = 0;
out0[0] = 0;
break;
case 18: out1[3] = amd_bytealign_S (in0[3], in0[2], 2);
out1[2] = amd_bytealign_S (in0[2], in0[1], 2);
out1[1] = amd_bytealign_S (in0[1], in0[0], 2);
out1[0] = amd_bytealign_S (in0[0], 0, 2);
case 18: out1[3] = hc_bytealign_S (in0[3], in0[2], 2);
out1[2] = hc_bytealign_S (in0[2], in0[1], 2);
out1[1] = hc_bytealign_S (in0[1], in0[0], 2);
out1[0] = hc_bytealign_S (in0[0], 0, 2);
out0[3] = 0;
out0[2] = 0;
out0[1] = 0;
out0[0] = 0;
break;
case 19: out1[3] = amd_bytealign_S (in0[3], in0[2], 1);
out1[2] = amd_bytealign_S (in0[2], in0[1], 1);
out1[1] = amd_bytealign_S (in0[1], in0[0], 1);
out1[0] = amd_bytealign_S (in0[0], 0, 1);
case 19: out1[3] = hc_bytealign_S (in0[3], in0[2], 1);
out1[2] = hc_bytealign_S (in0[2], in0[1], 1);
out1[1] = hc_bytealign_S (in0[1], in0[0], 1);
out1[0] = hc_bytealign_S (in0[0], 0, 1);
out0[3] = 0;
out0[2] = 0;
out0[1] = 0;
@ -646,27 +646,27 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 21: out1[3] = amd_bytealign_S (in0[2], in0[1], 3);
out1[2] = amd_bytealign_S (in0[1], in0[0], 3);
out1[1] = amd_bytealign_S (in0[0], 0, 3);
case 21: out1[3] = hc_bytealign_S (in0[2], in0[1], 3);
out1[2] = hc_bytealign_S (in0[1], in0[0], 3);
out1[1] = hc_bytealign_S (in0[0], 0, 3);
out1[0] = 0;
out0[3] = 0;
out0[2] = 0;
out0[1] = 0;
out0[0] = 0;
break;
case 22: out1[3] = amd_bytealign_S (in0[2], in0[1], 2);
out1[2] = amd_bytealign_S (in0[1], in0[0], 2);
out1[1] = amd_bytealign_S (in0[0], 0, 2);
case 22: out1[3] = hc_bytealign_S (in0[2], in0[1], 2);
out1[2] = hc_bytealign_S (in0[1], in0[0], 2);
out1[1] = hc_bytealign_S (in0[0], 0, 2);
out1[0] = 0;
out0[3] = 0;
out0[2] = 0;
out0[1] = 0;
out0[0] = 0;
break;
case 23: out1[3] = amd_bytealign_S (in0[2], in0[1], 1);
out1[2] = amd_bytealign_S (in0[1], in0[0], 1);
out1[1] = amd_bytealign_S (in0[0], 0, 1);
case 23: out1[3] = hc_bytealign_S (in0[2], in0[1], 1);
out1[2] = hc_bytealign_S (in0[1], in0[0], 1);
out1[1] = hc_bytealign_S (in0[0], 0, 1);
out1[0] = 0;
out0[3] = 0;
out0[2] = 0;
@ -682,8 +682,8 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 25: out1[3] = amd_bytealign_S (in0[1], in0[0], 3);
out1[2] = amd_bytealign_S (in0[0], 0, 3);
case 25: out1[3] = hc_bytealign_S (in0[1], in0[0], 3);
out1[2] = hc_bytealign_S (in0[0], 0, 3);
out1[1] = 0;
out1[0] = 0;
out0[3] = 0;
@ -691,8 +691,8 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 26: out1[3] = amd_bytealign_S (in0[1], in0[0], 2);
out1[2] = amd_bytealign_S (in0[0], 0, 2);
case 26: out1[3] = hc_bytealign_S (in0[1], in0[0], 2);
out1[2] = hc_bytealign_S (in0[0], 0, 2);
out1[1] = 0;
out1[0] = 0;
out0[3] = 0;
@ -700,8 +700,8 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 27: out1[3] = amd_bytealign_S (in0[1], in0[0], 1);
out1[2] = amd_bytealign_S (in0[0], 0, 1);
case 27: out1[3] = hc_bytealign_S (in0[1], in0[0], 1);
out1[2] = hc_bytealign_S (in0[0], 0, 1);
out1[1] = 0;
out1[0] = 0;
out0[3] = 0;
@ -718,7 +718,7 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 29: out1[3] = amd_bytealign_S (in0[0], 0, 3);
case 29: out1[3] = hc_bytealign_S (in0[0], 0, 3);
out1[2] = 0;
out1[1] = 0;
out1[0] = 0;
@ -727,7 +727,7 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 30: out1[3] = amd_bytealign_S (in0[0], 0, 2);
case 30: out1[3] = hc_bytealign_S (in0[0], 0, 2);
out1[2] = 0;
out1[1] = 0;
out1[0] = 0;
@ -736,7 +736,7 @@ void rshift_block_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const
out0[1] = 0;
out0[0] = 0;
break;
case 31: out1[3] = amd_bytealign_S (in0[0], 0, 1);
case 31: out1[3] = hc_bytealign_S (in0[0], 0, 1);
out1[2] = 0;
out1[1] = 0;
out1[0] = 0;
@ -803,44 +803,44 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
switch (offset_switch)
{
case 0:
s7 = amd_bytealign_S (src_r12, src_r13, offset);
s6 = amd_bytealign_S (src_r11, src_r12, offset);
s5 = amd_bytealign_S (src_r10, src_r11, offset);
s4 = amd_bytealign_S (src_r03, src_r10, offset);
s3 = amd_bytealign_S (src_r02, src_r03, offset);
s2 = amd_bytealign_S (src_r01, src_r02, offset);
s1 = amd_bytealign_S (src_r00, src_r01, offset);
s0 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S (src_r12, src_r13, offset);
s6 = hc_bytealign_S (src_r11, src_r12, offset);
s5 = hc_bytealign_S (src_r10, src_r11, offset);
s4 = hc_bytealign_S (src_r03, src_r10, offset);
s3 = hc_bytealign_S (src_r02, src_r03, offset);
s2 = hc_bytealign_S (src_r01, src_r02, offset);
s1 = hc_bytealign_S (src_r00, src_r01, offset);
s0 = hc_bytealign_S ( 0, src_r00, offset);
break;
case 1:
s7 = amd_bytealign_S (src_r11, src_r12, offset);
s6 = amd_bytealign_S (src_r10, src_r11, offset);
s5 = amd_bytealign_S (src_r03, src_r10, offset);
s4 = amd_bytealign_S (src_r02, src_r03, offset);
s3 = amd_bytealign_S (src_r01, src_r02, offset);
s2 = amd_bytealign_S (src_r00, src_r01, offset);
s1 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S (src_r11, src_r12, offset);
s6 = hc_bytealign_S (src_r10, src_r11, offset);
s5 = hc_bytealign_S (src_r03, src_r10, offset);
s4 = hc_bytealign_S (src_r02, src_r03, offset);
s3 = hc_bytealign_S (src_r01, src_r02, offset);
s2 = hc_bytealign_S (src_r00, src_r01, offset);
s1 = hc_bytealign_S ( 0, src_r00, offset);
s0 = 0;
break;
case 2:
s7 = amd_bytealign_S (src_r10, src_r11, offset);
s6 = amd_bytealign_S (src_r03, src_r10, offset);
s5 = amd_bytealign_S (src_r02, src_r03, offset);
s4 = amd_bytealign_S (src_r01, src_r02, offset);
s3 = amd_bytealign_S (src_r00, src_r01, offset);
s2 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S (src_r10, src_r11, offset);
s6 = hc_bytealign_S (src_r03, src_r10, offset);
s5 = hc_bytealign_S (src_r02, src_r03, offset);
s4 = hc_bytealign_S (src_r01, src_r02, offset);
s3 = hc_bytealign_S (src_r00, src_r01, offset);
s2 = hc_bytealign_S ( 0, src_r00, offset);
s1 = 0;
s0 = 0;
break;
case 3:
s7 = amd_bytealign_S (src_r03, src_r10, offset);
s6 = amd_bytealign_S (src_r02, src_r03, offset);
s5 = amd_bytealign_S (src_r01, src_r02, offset);
s4 = amd_bytealign_S (src_r00, src_r01, offset);
s3 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S (src_r03, src_r10, offset);
s6 = hc_bytealign_S (src_r02, src_r03, offset);
s5 = hc_bytealign_S (src_r01, src_r02, offset);
s4 = hc_bytealign_S (src_r00, src_r01, offset);
s3 = hc_bytealign_S ( 0, src_r00, offset);
s2 = 0;
s1 = 0;
s0 = 0;
@ -848,10 +848,10 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 4:
s7 = amd_bytealign_S (src_r02, src_r03, offset);
s6 = amd_bytealign_S (src_r01, src_r02, offset);
s5 = amd_bytealign_S (src_r00, src_r01, offset);
s4 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S (src_r02, src_r03, offset);
s6 = hc_bytealign_S (src_r01, src_r02, offset);
s5 = hc_bytealign_S (src_r00, src_r01, offset);
s4 = hc_bytealign_S ( 0, src_r00, offset);
s3 = 0;
s2 = 0;
s1 = 0;
@ -859,9 +859,9 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 5:
s7 = amd_bytealign_S (src_r01, src_r02, offset);
s6 = amd_bytealign_S (src_r00, src_r01, offset);
s5 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S (src_r01, src_r02, offset);
s6 = hc_bytealign_S (src_r00, src_r01, offset);
s5 = hc_bytealign_S ( 0, src_r00, offset);
s4 = 0;
s3 = 0;
s2 = 0;
@ -870,8 +870,8 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 6:
s7 = amd_bytealign_S (src_r00, src_r01, offset);
s6 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S (src_r00, src_r01, offset);
s6 = hc_bytealign_S ( 0, src_r00, offset);
s5 = 0;
s4 = 0;
s3 = 0;
@ -881,7 +881,7 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 7:
s7 = amd_bytealign_S ( 0, src_r00, offset);
s7 = hc_bytealign_S ( 0, src_r00, offset);
s6 = 0;
s5 = 0;
s4 = 0;
@ -928,44 +928,44 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
switch (offset_switch)
{
case 0:
s7 = __byte_perm_S (src_r12, src_r13, selector);
s6 = __byte_perm_S (src_r11, src_r12, selector);
s5 = __byte_perm_S (src_r10, src_r11, selector);
s4 = __byte_perm_S (src_r03, src_r10, selector);
s3 = __byte_perm_S (src_r02, src_r03, selector);
s2 = __byte_perm_S (src_r01, src_r02, selector);
s1 = __byte_perm_S (src_r00, src_r01, selector);
s0 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S (src_r12, src_r13, selector);
s6 = hc_byte_perm_S (src_r11, src_r12, selector);
s5 = hc_byte_perm_S (src_r10, src_r11, selector);
s4 = hc_byte_perm_S (src_r03, src_r10, selector);
s3 = hc_byte_perm_S (src_r02, src_r03, selector);
s2 = hc_byte_perm_S (src_r01, src_r02, selector);
s1 = hc_byte_perm_S (src_r00, src_r01, selector);
s0 = hc_byte_perm_S ( 0, src_r00, selector);
break;
case 1:
s7 = __byte_perm_S (src_r11, src_r12, selector);
s6 = __byte_perm_S (src_r10, src_r11, selector);
s5 = __byte_perm_S (src_r03, src_r10, selector);
s4 = __byte_perm_S (src_r02, src_r03, selector);
s3 = __byte_perm_S (src_r01, src_r02, selector);
s2 = __byte_perm_S (src_r00, src_r01, selector);
s1 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S (src_r11, src_r12, selector);
s6 = hc_byte_perm_S (src_r10, src_r11, selector);
s5 = hc_byte_perm_S (src_r03, src_r10, selector);
s4 = hc_byte_perm_S (src_r02, src_r03, selector);
s3 = hc_byte_perm_S (src_r01, src_r02, selector);
s2 = hc_byte_perm_S (src_r00, src_r01, selector);
s1 = hc_byte_perm_S ( 0, src_r00, selector);
s0 = 0;
break;
case 2:
s7 = __byte_perm_S (src_r10, src_r11, selector);
s6 = __byte_perm_S (src_r03, src_r10, selector);
s5 = __byte_perm_S (src_r02, src_r03, selector);
s4 = __byte_perm_S (src_r01, src_r02, selector);
s3 = __byte_perm_S (src_r00, src_r01, selector);
s2 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S (src_r10, src_r11, selector);
s6 = hc_byte_perm_S (src_r03, src_r10, selector);
s5 = hc_byte_perm_S (src_r02, src_r03, selector);
s4 = hc_byte_perm_S (src_r01, src_r02, selector);
s3 = hc_byte_perm_S (src_r00, src_r01, selector);
s2 = hc_byte_perm_S ( 0, src_r00, selector);
s1 = 0;
s0 = 0;
break;
case 3:
s7 = __byte_perm_S (src_r03, src_r10, selector);
s6 = __byte_perm_S (src_r02, src_r03, selector);
s5 = __byte_perm_S (src_r01, src_r02, selector);
s4 = __byte_perm_S (src_r00, src_r01, selector);
s3 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S (src_r03, src_r10, selector);
s6 = hc_byte_perm_S (src_r02, src_r03, selector);
s5 = hc_byte_perm_S (src_r01, src_r02, selector);
s4 = hc_byte_perm_S (src_r00, src_r01, selector);
s3 = hc_byte_perm_S ( 0, src_r00, selector);
s2 = 0;
s1 = 0;
s0 = 0;
@ -973,10 +973,10 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 4:
s7 = __byte_perm_S (src_r02, src_r03, selector);
s6 = __byte_perm_S (src_r01, src_r02, selector);
s5 = __byte_perm_S (src_r00, src_r01, selector);
s4 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S (src_r02, src_r03, selector);
s6 = hc_byte_perm_S (src_r01, src_r02, selector);
s5 = hc_byte_perm_S (src_r00, src_r01, selector);
s4 = hc_byte_perm_S ( 0, src_r00, selector);
s3 = 0;
s2 = 0;
s1 = 0;
@ -984,9 +984,9 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 5:
s7 = __byte_perm_S (src_r01, src_r02, selector);
s6 = __byte_perm_S (src_r00, src_r01, selector);
s5 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S (src_r01, src_r02, selector);
s6 = hc_byte_perm_S (src_r00, src_r01, selector);
s5 = hc_byte_perm_S ( 0, src_r00, selector);
s4 = 0;
s3 = 0;
s2 = 0;
@ -995,8 +995,8 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 6:
s7 = __byte_perm_S (src_r00, src_r01, selector);
s6 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S (src_r00, src_r01, selector);
s6 = hc_byte_perm_S ( 0, src_r00, selector);
s5 = 0;
s4 = 0;
s3 = 0;
@ -1006,7 +1006,7 @@ void append_block8 (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, c
break;
case 7:
s7 = __byte_perm_S ( 0, src_r00, selector);
s7 = hc_byte_perm_S ( 0, src_r00, selector);
s6 = 0;
s5 = 0;
s4 = 0;
@ -1681,10 +1681,10 @@ u32 search_on_register (const u32 in, const u32 p0)
{
u32 r = 0;
if (__bfe_S (in, 0, 8) == p0) r |= 1;
if (__bfe_S (in, 8, 8) == p0) r |= 2;
if (__bfe_S (in, 16, 8) == p0) r |= 4;
if (__bfe_S (in, 24, 8) == p0) r |= 8;
if (hc_bfe_S (in, 0, 8) == p0) r |= 1;
if (hc_bfe_S (in, 8, 8) == p0) r |= 2;
if (hc_bfe_S (in, 16, 8) == p0) r |= 4;
if (hc_bfe_S (in, 24, 8) == p0) r |= 8;
return r;
}

@ -351,23 +351,28 @@ DECLSPEC u64x rotl64 (const u64x a, const u32 n)
return rotr64 (a, 64 - n);
}
DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
{
return amd_bfe (a, b, c);
}
DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
{
return amd_bfe (a, b, c);
}
DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_bytealign (const u32x a, const u32x b, const u32x c)
{
return amd_bytealign (a, b, c);
}
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
{
return amd_bytealign (a, b, c);
}
#if AMD_GCN >= 3
DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c)
{
u32x r;
@ -420,7 +425,7 @@ DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c)
return r;
}
DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -431,7 +436,7 @@ DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
#endif
#if AMD_GCN >= 5
DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
u32x r;
@ -484,7 +489,7 @@ DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c)
return r;
}
DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -493,12 +498,12 @@ DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c)
return r;
}
#else
DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
return a + b + c;
}
DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
return a + b + c;
}
@ -741,7 +746,7 @@ DECLSPEC u64x rotl64 (const u64x a, const u32 n)
return rotate (a, (u64x) n);
}
DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c)
{
u32x r;
@ -780,7 +785,7 @@ DECLSPEC u32x __byte_perm (const u32x a, const u32x b, const u32x c)
return r;
}
DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -789,7 +794,7 @@ DECLSPEC u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
return r;
}
DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
{
u32x r;
@ -828,7 +833,7 @@ DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c)
return r;
}
DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -837,7 +842,7 @@ DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c)
return r;
}
DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const u32x c)
{
u32x r;
@ -877,14 +882,14 @@ DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32x c)
#else
r = __byte_perm (b, a, ((u32x) (0x76543210) >> ((c & 3) * 4)) & 0xffff);
r = hc_byte_perm (b, a, ((u32x) (0x76543210) >> ((c & 3) * 4)) & 0xffff);
#endif
return r;
}
DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -894,19 +899,19 @@ DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
#else
r = __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
r = hc_byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff);
#endif
return r;
}
DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
return a + b + c;
}
DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
return a + b + c;
}
@ -984,7 +989,7 @@ DECLSPEC u64x rotl64 (const u64x a, const u32 n)
return rotate (a, (u64x) n);
}
DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_bfe (const u32x a, const u32x b, const u32x c)
{
#define BIT(x) ((u32x) (1u) << (x))
#define BIT_MASK(x) (BIT (x) - 1)
@ -997,7 +1002,7 @@ DECLSPEC u32x __bfe (const u32x a, const u32x b, const u32x c)
#undef BFE
}
DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
{
#define BIT(x) (1u << (x))
#define BIT_MASK(x) (BIT (x) - 1)
@ -1010,7 +1015,7 @@ DECLSPEC u32 __bfe_S (const u32 a, const u32 b, const u32 c)
#undef BFE
}
DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const u32 c)
{
#if VECT_SIZE == 1
const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8);
@ -1043,19 +1048,19 @@ DECLSPEC u32x amd_bytealign (const u32x a, const u32x b, const u32 c)
#endif
}
DECLSPEC u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
{
const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);
return (u32) (tmp);
}
DECLSPEC u32x __add3 (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)
{
return a + b + c;
}
DECLSPEC u32 __add3_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_add3_S (const u32 a, const u32 b, const u32 c)
{
return a + b + c;
}

@ -101,9 +101,13 @@
*/
#ifdef IS_AMD
#if defined(cl_amd_media_ops)
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#endif
#if defined(cl_amd_media_ops2)
#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
#endif
#endif
/**
* Unrolling is generally enabled, for all device types and hash modes

@ -35,11 +35,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -56,11 +56,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, 0, selector);
#endif
const u32 div = offset / 4;
@ -149,11 +149,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -171,11 +171,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, in4, selector);
#endif
const u32 div = offset / 4;
@ -259,9 +259,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -274,9 +274,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, 0, selector);
#endif
const u32 div = offset / 4;

@ -34,11 +34,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -55,11 +55,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, 0, selector);
#endif
const u32 div = offset / 4;
@ -148,11 +148,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -170,11 +170,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, in4, selector);
#endif
const u32 div = offset / 4;
@ -258,9 +258,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -273,9 +273,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, 0, selector);
#endif
const u32 div = offset / 4;

@ -307,10 +307,10 @@ __constant u32a c_sbox3[256] =
{ \
u32 tmp; \
\
tmp = S0[__bfe ((L), 24, 8)]; \
tmp += S1[__bfe ((L), 16, 8)]; \
tmp ^= S2[__bfe ((L), 8, 8)]; \
tmp += S3[__bfe ((L), 0, 8)]; \
tmp = S0[hc_bfe ((L), 24, 8)]; \
tmp += S1[hc_bfe ((L), 16, 8)]; \
tmp ^= S2[hc_bfe ((L), 8, 8)]; \
tmp += S3[hc_bfe ((L), 0, 8)]; \
\
(R) ^= tmp ^ P[(N)]; \
}

@ -2123,12 +2123,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
u32 in3 = swap32_S (append[3]);
u32 in4 = swap32_S (append[4]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp5 = amd_bytealign (in4, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp5 = hc_bytealign (in4, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -2147,12 +2147,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
u32 in3 = append[3];
u32 in4 = append[4];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
tmp5 = __byte_perm (in4, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, in4, selector);
tmp5 = hc_byte_perm (in4, 0, selector);
#endif
const u32 div = offset / 4;

@ -2123,12 +2123,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
u32 in3 = swap32_S (append[3]);
u32 in4 = swap32_S (append[4]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp5 = amd_bytealign (in4, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp5 = hc_bytealign (in4, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -2147,12 +2147,12 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
u32 in3 = append[3];
u32 in4 = append[4];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp3 = __byte_perm (in3, in4, selector);
tmp4 = __byte_perm (in4, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp3 = hc_byte_perm (in3, in4, selector);
tmp4 = hc_byte_perm (in4, 0, selector);
#endif
const u32 div = offset / 4;

@ -31,11 +31,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -52,11 +52,11 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, 0, selector);
#endif
const u32 div = offset / 4;
@ -145,11 +145,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -167,11 +167,11 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, in4, selector);
#endif
const u32 div = offset / 4;
@ -255,9 +255,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -270,9 +270,9 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, 0, selector);
#endif
const u32 div = offset / 4;

@ -102,11 +102,11 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -123,11 +123,11 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, 0, selector);
#endif
switch (offset / 4)
@ -243,11 +243,11 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -264,11 +264,11 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, 0, selector);
#endif
u32 carry[4] = { 0, 0, 0, 0 };
@ -410,11 +410,11 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -431,11 +431,11 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, 0, selector);
#endif
switch (offset / 4)
@ -560,11 +560,11 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
@ -582,11 +582,11 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
tmp0 = hc_byte_perm ( 0, in0, selector);
tmp1 = hc_byte_perm (in0, in1, selector);
tmp2 = hc_byte_perm (in1, in2, selector);
tmp3 = hc_byte_perm (in2, in3, selector);
tmp4 = hc_byte_perm (in3, in4, selector);
#endif
switch (offset / 4)

@ -316,10 +316,10 @@ __constant u32a c_pbox[18] =
{ \
u32 tmp; \
\
tmp = S0[__bfe_S ((L), 24, 8)]; \
tmp += S1[__bfe_S ((L), 16, 8)]; \
tmp ^= S2[__bfe_S ((L), 8, 8)]; \
tmp += S3[__bfe_S ((L), 0, 8)]; \
tmp = S0[hc_bfe_S ((L), 24, 8)]; \
tmp += S1[hc_bfe_S ((L), 16, 8)]; \
tmp ^= S2[hc_bfe_S ((L), 8, 8)]; \
tmp += S3[hc_bfe_S ((L), 0, 8)]; \
\
(R) ^= tmp ^ P[(N)]; \
}

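hc_bfe_S here is a scalar unsigned bit-field extract feeding the Blowfish S-box lookups. A minimal sketch of a generic fallback, assuming it matches the unsigned behaviour of NVIDIA's bfe.u32 for the 8-bit fields used above (the len == 32 corner case is deliberately ignored):

// hypothetical generic fallback, not part of this diff
DECLSPEC u32 hc_bfe_S (const u32 val, const u32 pos, const u32 len)
{
  // extract 'len' bits of 'val' starting at bit position 'pos'
  return (val >> pos) & ((1u << len) - 1);
}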
@ -200,11 +200,11 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl,
#if defined IS_AMD || defined IS_GENERIC
for (i = 0; i < pd; i++) sc[idx++] = pw[i];
sc[idx++] = pw[i]
| amd_bytealign (bl[0], 0, pm4);
for (i = 1; i < bd; i++) sc[idx++] = amd_bytealign (bl[i], bl[i - 1], pm4);
sc[idx++] = amd_bytealign (sc[0], bl[i - 1], pm4);
for (i = 1; i < 4; i++) sc[idx++] = amd_bytealign (sc[i], sc[i - 1], pm4);
sc[idx++] = amd_bytealign ( 0, sc[i - 1], pm4);
| hc_bytealign (bl[0], 0, pm4);
for (i = 1; i < bd; i++) sc[idx++] = hc_bytealign (bl[i], bl[i - 1], pm4);
sc[idx++] = hc_bytealign (sc[0], bl[i - 1], pm4);
for (i = 1; i < 4; i++) sc[idx++] = hc_bytealign (sc[i], sc[i - 1], pm4);
sc[idx++] = hc_bytealign ( 0, sc[i - 1], pm4);
#endif
#ifdef IS_NV
@ -212,11 +212,11 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl,
for (i = 0; i < pd; i++) sc[idx++] = pw[i];
sc[idx++] = pw[i]
| __byte_perm ( 0, bl[0], selector);
for (i = 1; i < bd; i++) sc[idx++] = __byte_perm (bl[i - 1], bl[i], selector);
sc[idx++] = __byte_perm (bl[i - 1], sc[0], selector);
for (i = 1; i < 4; i++) sc[idx++] = __byte_perm (sc[i - 1], sc[i], selector);
sc[idx++] = __byte_perm (sc[i - 1], 0, selector);
| hc_byte_perm ( 0, bl[0], selector);
for (i = 1; i < bd; i++) sc[idx++] = hc_byte_perm (bl[i - 1], bl[i], selector);
sc[idx++] = hc_byte_perm (bl[i - 1], sc[0], selector);
for (i = 1; i < 4; i++) sc[idx++] = hc_byte_perm (sc[i - 1], sc[i], selector);
sc[idx++] = hc_byte_perm (sc[i - 1], 0, selector);
#endif
}
}
@ -229,19 +229,19 @@ DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, con
const u32 od = m / 4;
#if defined IS_AMD || defined IS_GENERIC
pt[0] = amd_bytealign (sc[od + 1], sc[od + 0], om);
pt[1] = amd_bytealign (sc[od + 2], sc[od + 1], om);
pt[2] = amd_bytealign (sc[od + 3], sc[od + 2], om);
pt[3] = amd_bytealign (sc[od + 4], sc[od + 3], om);
pt[0] = hc_bytealign (sc[od + 1], sc[od + 0], om);
pt[1] = hc_bytealign (sc[od + 2], sc[od + 1], om);
pt[2] = hc_bytealign (sc[od + 3], sc[od + 2], om);
pt[3] = hc_bytealign (sc[od + 4], sc[od + 3], om);
#endif
#ifdef IS_NV
int selector = (0x76543210 >> (om * 4)) & 0xffff;
pt[0] = __byte_perm (sc[od + 0], sc[od + 1], selector);
pt[1] = __byte_perm (sc[od + 1], sc[od + 2], selector);
pt[2] = __byte_perm (sc[od + 2], sc[od + 3], selector);
pt[3] = __byte_perm (sc[od + 3], sc[od + 4], selector);
pt[0] = hc_byte_perm (sc[od + 0], sc[od + 1], selector);
pt[1] = hc_byte_perm (sc[od + 1], sc[od + 2], selector);
pt[2] = hc_byte_perm (sc[od + 2], sc[od + 3], selector);
pt[3] = hc_byte_perm (sc[od + 3], sc[od + 4], selector);
#endif
}

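On the NVIDIA side the selector is built as (0x76543210 >> (om * 4)) & 0xffff, so for om == 1 it becomes 0x4321: each nibble names one byte of the 64-bit pair formed by the two inputs, which turns the permute into a one-byte funnel shift. As an assumption (the real hc_byte_perm maps to __byte_perm/prmt.b32 on NVIDIA), a portable model of the default permute mode could look like this, ignoring prmt's replicate and sign modes:

// hypothetical portable model, not part of this diff
DECLSPEC u32 hc_byte_perm_S (const u32 a, const u32 b, const int s)
{
  // a supplies bytes 0..3, b supplies bytes 4..7 of the source value
  const u64 src = (((u64) b) << 32) | (u64) a;

  u32 r = 0;

  for (int i = 0; i < 4; i++)
  {
    const u32 sel = (s >> (i * 4)) & 7; // byte index 0..7

    r |= (u32) ((src >> (sel * 8)) & 0xff) << (i * 8);
  }

  return r;
}

With that model, hc_byte_perm (sc[od + 0], sc[od + 1], selector) returns the four bytes starting om bytes into sc[od + 0], which is the same result the AMD path gets from hc_bytealign (sc[od + 1], sc[od + 0], om); note the swapped argument order between the two intrinsics.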
@ -23,13 +23,13 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
#ifdef IS_NV
const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff;
tmp0 = __byte_perm (append, 0, selector);
tmp1 = __byte_perm (0, append, selector);
tmp0 = hc_byte_perm (append, 0, selector);
tmp1 = hc_byte_perm (0, append, selector);
#endif
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (0, append, func_len);
tmp1 = amd_bytealign (append, 0, func_len);
tmp0 = hc_bytealign (0, append, func_len);
tmp1 = hc_bytealign (append, 0, func_len);
#endif
u32 carry = 0;

@ -45,45 +45,45 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
tmp00 = hc_bytealign ( 0, carry[ 0], offset);
tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign (carry[10], carry[11], offset);
tmp12 = hc_bytealign (carry[11], carry[12], offset);
tmp13 = hc_bytealign (carry[12], carry[13], offset);
tmp14 = hc_bytealign (carry[13], carry[14], offset);
tmp15 = hc_bytealign (carry[14], carry[15], offset);
tmp16 = hc_bytealign (carry[15], 0, offset);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
tmp00 = __byte_perm (carry[ 0], 0, selector);
tmp01 = __byte_perm (carry[ 1], carry[ 0], selector);
tmp02 = __byte_perm (carry[ 2], carry[ 1], selector);
tmp03 = __byte_perm (carry[ 3], carry[ 2], selector);
tmp04 = __byte_perm (carry[ 4], carry[ 3], selector);
tmp05 = __byte_perm (carry[ 5], carry[ 4], selector);
tmp06 = __byte_perm (carry[ 6], carry[ 5], selector);
tmp07 = __byte_perm (carry[ 7], carry[ 6], selector);
tmp08 = __byte_perm (carry[ 8], carry[ 7], selector);
tmp09 = __byte_perm (carry[ 9], carry[ 8], selector);
tmp10 = __byte_perm (carry[10], carry[ 9], selector);
tmp11 = __byte_perm (carry[11], carry[10], selector);
tmp12 = __byte_perm (carry[12], carry[11], selector);
tmp13 = __byte_perm (carry[13], carry[12], selector);
tmp14 = __byte_perm (carry[14], carry[13], selector);
tmp15 = __byte_perm (carry[15], carry[14], selector);
tmp16 = __byte_perm ( 0, carry[15], selector);
tmp00 = hc_byte_perm (carry[ 0], 0, selector);
tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector);
tmp02 = hc_byte_perm (carry[ 2], carry[ 1], selector);
tmp03 = hc_byte_perm (carry[ 3], carry[ 2], selector);
tmp04 = hc_byte_perm (carry[ 4], carry[ 3], selector);
tmp05 = hc_byte_perm (carry[ 5], carry[ 4], selector);
tmp06 = hc_byte_perm (carry[ 6], carry[ 5], selector);
tmp07 = hc_byte_perm (carry[ 7], carry[ 6], selector);
tmp08 = hc_byte_perm (carry[ 8], carry[ 7], selector);
tmp09 = hc_byte_perm (carry[ 9], carry[ 8], selector);
tmp10 = hc_byte_perm (carry[10], carry[ 9], selector);
tmp11 = hc_byte_perm (carry[11], carry[10], selector);
tmp12 = hc_byte_perm (carry[12], carry[11], selector);
tmp13 = hc_byte_perm (carry[13], carry[12], selector);
tmp14 = hc_byte_perm (carry[14], carry[13], selector);
tmp15 = hc_byte_perm (carry[15], carry[14], selector);
tmp16 = hc_byte_perm ( 0, carry[15], selector);
#endif
carry[ 0] = 0;

@ -43,45 +43,45 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
tmp00 = hc_bytealign ( 0, carry[ 0], offset);
tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign (carry[10], carry[11], offset);
tmp12 = hc_bytealign (carry[11], carry[12], offset);
tmp13 = hc_bytealign (carry[12], carry[13], offset);
tmp14 = hc_bytealign (carry[13], carry[14], offset);
tmp15 = hc_bytealign (carry[14], carry[15], offset);
tmp16 = hc_bytealign (carry[15], 0, offset);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
tmp00 = __byte_perm (carry[ 0], 0, selector);
tmp01 = __byte_perm (carry[ 1], carry[ 0], selector);
tmp02 = __byte_perm (carry[ 2], carry[ 1], selector);
tmp03 = __byte_perm (carry[ 3], carry[ 2], selector);
tmp04 = __byte_perm (carry[ 4], carry[ 3], selector);
tmp05 = __byte_perm (carry[ 5], carry[ 4], selector);
tmp06 = __byte_perm (carry[ 6], carry[ 5], selector);
tmp07 = __byte_perm (carry[ 7], carry[ 6], selector);
tmp08 = __byte_perm (carry[ 8], carry[ 7], selector);
tmp09 = __byte_perm (carry[ 9], carry[ 8], selector);
tmp10 = __byte_perm (carry[10], carry[ 9], selector);
tmp11 = __byte_perm (carry[11], carry[10], selector);
tmp12 = __byte_perm (carry[12], carry[11], selector);
tmp13 = __byte_perm (carry[13], carry[12], selector);
tmp14 = __byte_perm (carry[14], carry[13], selector);
tmp15 = __byte_perm (carry[15], carry[14], selector);
tmp16 = __byte_perm ( 0, carry[15], selector);
tmp00 = hc_byte_perm (carry[ 0], 0, selector);
tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector);
tmp02 = hc_byte_perm (carry[ 2], carry[ 1], selector);
tmp03 = hc_byte_perm (carry[ 3], carry[ 2], selector);
tmp04 = hc_byte_perm (carry[ 4], carry[ 3], selector);
tmp05 = hc_byte_perm (carry[ 5], carry[ 4], selector);
tmp06 = hc_byte_perm (carry[ 6], carry[ 5], selector);
tmp07 = hc_byte_perm (carry[ 7], carry[ 6], selector);
tmp08 = hc_byte_perm (carry[ 8], carry[ 7], selector);
tmp09 = hc_byte_perm (carry[ 9], carry[ 8], selector);
tmp10 = hc_byte_perm (carry[10], carry[ 9], selector);
tmp11 = hc_byte_perm (carry[11], carry[10], selector);
tmp12 = hc_byte_perm (carry[12], carry[11], selector);
tmp13 = hc_byte_perm (carry[13], carry[12], selector);
tmp14 = hc_byte_perm (carry[14], carry[13], selector);
tmp15 = hc_byte_perm (carry[15], carry[14], selector);
tmp16 = hc_byte_perm ( 0, carry[15], selector);
#endif
carry[ 0] = 0;

@ -42,45 +42,45 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
tmp00 = hc_bytealign ( 0, carry[ 0], offset);
tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign (carry[10], carry[11], offset);
tmp12 = hc_bytealign (carry[11], carry[12], offset);
tmp13 = hc_bytealign (carry[12], carry[13], offset);
tmp14 = hc_bytealign (carry[13], carry[14], offset);
tmp15 = hc_bytealign (carry[14], carry[15], offset);
tmp16 = hc_bytealign (carry[15], 0, offset);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
tmp00 = __byte_perm (carry[ 0], 0, selector);
tmp01 = __byte_perm (carry[ 1], carry[ 0], selector);
tmp02 = __byte_perm (carry[ 2], carry[ 1], selector);
tmp03 = __byte_perm (carry[ 3], carry[ 2], selector);
tmp04 = __byte_perm (carry[ 4], carry[ 3], selector);
tmp05 = __byte_perm (carry[ 5], carry[ 4], selector);
tmp06 = __byte_perm (carry[ 6], carry[ 5], selector);
tmp07 = __byte_perm (carry[ 7], carry[ 6], selector);
tmp08 = __byte_perm (carry[ 8], carry[ 7], selector);
tmp09 = __byte_perm (carry[ 9], carry[ 8], selector);
tmp10 = __byte_perm (carry[10], carry[ 9], selector);
tmp11 = __byte_perm (carry[11], carry[10], selector);
tmp12 = __byte_perm (carry[12], carry[11], selector);
tmp13 = __byte_perm (carry[13], carry[12], selector);
tmp14 = __byte_perm (carry[14], carry[13], selector);
tmp15 = __byte_perm (carry[15], carry[14], selector);
tmp16 = __byte_perm ( 0, carry[15], selector);
tmp00 = hc_byte_perm (carry[ 0], 0, selector);
tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector);
tmp02 = hc_byte_perm (carry[ 2], carry[ 1], selector);
tmp03 = hc_byte_perm (carry[ 3], carry[ 2], selector);
tmp04 = hc_byte_perm (carry[ 4], carry[ 3], selector);
tmp05 = hc_byte_perm (carry[ 5], carry[ 4], selector);
tmp06 = hc_byte_perm (carry[ 6], carry[ 5], selector);
tmp07 = hc_byte_perm (carry[ 7], carry[ 6], selector);
tmp08 = hc_byte_perm (carry[ 8], carry[ 7], selector);
tmp09 = hc_byte_perm (carry[ 9], carry[ 8], selector);
tmp10 = hc_byte_perm (carry[10], carry[ 9], selector);
tmp11 = hc_byte_perm (carry[11], carry[10], selector);
tmp12 = hc_byte_perm (carry[12], carry[11], selector);
tmp13 = hc_byte_perm (carry[13], carry[12], selector);
tmp14 = hc_byte_perm (carry[14], carry[13], selector);
tmp15 = hc_byte_perm (carry[15], carry[14], selector);
tmp16 = hc_byte_perm ( 0, carry[15], selector);
#endif
carry[ 0] = 0;
