Fixed some speeds (kernel fixes and gpu_accel tuning) for the following hash modes:

1800
3200
7100
7200
7400
8200
9000
9300
9600
11300
11600
jsteube 9 years ago
parent 806f73b685
commit 7be2c2fd8d

@@ -141,6 +141,7 @@ static void sha512_transform (const u64 w[16], u64 digest[8])
   digest[7] += h;
 }
 
+#ifdef IS_AMD
 static void sha512_transform_workaround (const u64 w[16], u64 digest[8])
 {
   u64 w0_t = w[ 0];
@@ -205,6 +206,7 @@ static void sha512_transform_workaround (const u64 w[16], u64 digest[8])
   digest[6] += g;
   digest[7] += h;
 }
+#endif
 
 static void sha512_init (sha512_ctx_t *sha512_ctx)
 {
@@ -243,7 +245,13 @@ static void sha512_update (sha512_ctx_t *sha512_ctx, const u64 *buf, int len)
       PUTCHAR64_BE (sha512_ctx->buf, pos++, GETCHAR64_BE (buf, i));
     }
 
+    #ifdef IS_AMD
+    sha512_transform_workaround (sha512_ctx->buf, sha512_ctx->state);
+    #endif
+
+    #ifdef IS_NV
     sha512_transform (sha512_ctx->buf, sha512_ctx->state);
+    #endif
 
     len -= cnt;
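
For context: IS_AMD and IS_NV are mutually exclusive defines handed to the OpenCL compiler when the kernel is built, so only one of the two transform calls survives preprocessing. A minimal host-side sketch of that selection, assuming a simple vendor-string check (illustrative, not oclHashcat's actual detection code):

    #include <string.h>
    #include <CL/cl.h>

    /* Hypothetical helper: choose the vendor define for one device so that
       #ifdef IS_AMD / #ifdef IS_NV resolve at kernel build time. */
    static cl_program build_with_vendor_define (cl_context ctx, cl_device_id dev, const char *src)
    {
      char vendor[256];

      clGetDeviceInfo (dev, CL_DEVICE_VENDOR, sizeof (vendor), vendor, NULL);

      const char *opts = (strstr (vendor, "NVIDIA") != NULL) ? "-D IS_NV=1" : "-D IS_AMD=1";

      cl_program program = clCreateProgramWithSource (ctx, 1, &src, NULL, NULL);

      clBuildProgram (program, 1, &dev, opts, NULL, NULL);

      return program;
    }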

@@ -303,6 +303,7 @@ __constant u32 c_pbox[18] =
   0x9216d5d9, 0x8979fb1b
 };
 
+/*
 #define BF_ROUND(L,R,N)             \
 {                                   \
   uchar4 c = as_uchar4 ((L));       \
@@ -316,6 +317,20 @@ __constant u32 c_pbox[18] =
                                     \
   (R) ^= tmp ^ P[(N)];              \
 }
+*/
+
+#define BF_ROUND(L,R,N)             \
+{                                   \
+  u32 tmp;                          \
+                                    \
+  tmp  = S0[((L) >> 24) & 0xff];    \
+  tmp += S1[((L) >> 16) & 0xff];    \
+  tmp ^= S2[((L) >>  8) & 0xff];    \
+  tmp += S3[((L) >>  0) & 0xff];    \
+                                    \
+  (R) ^= tmp ^ P[(N)];              \
+}
 
 #define BF_ENCRYPT(L,R) \
 {                       \
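
The old (now commented-out) BF_ROUND pulled the four S-box indices out of L with as_uchar4; the replacement computes the same indices with shifts and masks, and the ((S0 + S1) ^ S2) + S3 combination of Blowfish's F function is unchanged. A quick host-side check in plain C, assuming little-endian byte order as on the GPU targets, that both extractions agree:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main (void)
    {
      uint32_t L = 0x11223344;

      /* as_uchar4 ((L)) on a little-endian device: c[3] is the high byte */
      uint8_t c[4];

      memcpy (c, &L, 4);

      /* shift/mask form used by the new BF_ROUND */
      assert (((L >> 24) & 0xff) == c[3]); /* S0 index */
      assert (((L >> 16) & 0xff) == c[2]); /* S1 index */
      assert (((L >>  8) & 0xff) == c[1]); /* S2 index */
      assert (((L >>  0) & 0xff) == c[0]); /* S3 index */

      return 0;
    }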

@@ -1001,7 +1001,7 @@ __constant u64 k_sha512[80] =
   SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f,
 };
 
-static void sha512_transform (volatile const u64 w0[4], volatile const u64 w1[4], volatile const u64 w2[4], volatile const u64 w3[4], volatile u64 dgst[8])
+static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], const u64 w3[4], u64 dgst[8])
 {
   u64 a = dgst[0];
   u64 b = dgst[1];
@@ -1071,7 +1071,7 @@ static void sha512_transform (volatile const u64 w0[4], volatile const u64 w1[4]
 
   ROUND_STEP (0);
 
-  #pragma unroll
+  //#pragma unroll
   for (int i = 16; i < 80; i += 16)
   {
     ROUND_EXPAND (); ROUND_STEP (i);

@@ -925,22 +925,22 @@ static void sha512_transform (const u64 w[16], u64 dgst[8])
   u64 g = dgst[6];
   u64 h = dgst[7];
 
-  volatile u64 w0_t = w[ 0];
-  volatile u64 w1_t = w[ 1];
-  volatile u64 w2_t = w[ 2];
-  volatile u64 w3_t = w[ 3];
-  volatile u64 w4_t = w[ 4];
-  volatile u64 w5_t = w[ 5];
-  volatile u64 w6_t = w[ 6];
-  volatile u64 w7_t = w[ 7];
-  volatile u64 w8_t = w[ 8];
-  volatile u64 w9_t = w[ 9];
-  volatile u64 wa_t = w[10];
-  volatile u64 wb_t = w[11];
-  volatile u64 wc_t = w[12];
-  volatile u64 wd_t = w[13];
-  volatile u64 we_t = w[14];
-  volatile u64 wf_t = w[15];
+  u64 w0_t = w[ 0];
+  u64 w1_t = w[ 1];
+  u64 w2_t = w[ 2];
+  u64 w3_t = w[ 3];
+  u64 w4_t = w[ 4];
+  u64 w5_t = w[ 5];
+  u64 w6_t = w[ 6];
+  u64 w7_t = w[ 7];
+  u64 w8_t = w[ 8];
+  u64 w9_t = w[ 9];
+  u64 wa_t = w[10];
+  u64 wb_t = w[11];
+  u64 wc_t = w[12];
+  u64 wd_t = w[13];
+  u64 we_t = w[14];
+  u64 wf_t = w[15];
 
   #define ROUND_EXPAND() \
   { \
@@ -984,7 +984,7 @@ static void sha512_transform (const u64 w[16], u64 dgst[8])
 
   ROUND_STEP (0);
 
-  #pragma unroll
+  // #pragma unroll
   for (int i = 16; i < 80; i += 16)
   {
     ROUND_EXPAND (); ROUND_STEP (i);

@@ -181,78 +181,26 @@ static u64 rotl64 (const u64 a, const u32 n)
 #ifdef IS_NV
-#if CUDA_ARCH >= 350
-static u32 rotr32 (const u32 a, const u32 n)
-{
-  u32 r;
-
-  asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(a), "r"(n));
-
-  return r;
-}
-
-static u32 rotl32 (const u32 a, const u32 n)
-{
-  return rotr32 (a, 32 - n);
-}
-
-static u64 rotr64 (const u64 a, const u32 n)
-{
-  u32 il;
-  u32 ir;
-
-  asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a));
-
-  u32 tl;
-  u32 tr;
-
-  if (n >= 32)
-  {
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32));
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32));
-  }
-  else
-  {
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n));
-    asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n));
-  }
-
-  u64 r;
-
-  asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr));
-
-  return r;
-}
-
-static u64 rotl64 (const u64 a, const u32 n)
-{
-  return rotr64 (a, 64 - n);
-}
-
-#else
-
 static u32 rotr32 (const u32 a, const u32 n)
 {
-  return (((a) >> (n)) | ((a) << (32 - (n))));
+  return rotate (a, 32 - n);
 }
 
 static u32 rotl32 (const u32 a, const u32 n)
 {
-  return rotr32 (a, 32 - n);
+  return rotate (a, n);
 }
 
-static u64 rotr64 (const u64 a, const u32 n)
+static u64 rotr64 (const u64 a, const u64 n)
 {
-  return (((a) >> (n)) | ((a) << (64 - (n))));
+  return rotate (a, 64 - n);
 }
 
-static u64 rotl64 (const u64 a, const u32 n)
+static u64 rotl64 (const u64 a, const u64 n)
 {
-  return rotr64 (a, 64 - n);
+  return rotate (a, n);
 }
 
-#endif
 #endif
 
 typedef struct
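
The rewritten helpers lean on OpenCL's built-in rotate (), which rotates left, via the identity rotr (a, n) == rotl (a, width - n); rotr64 and rotl64 now take a u64 shift count because rotate () requires both operands to have the same type. The same identity is what turns rotr64 ((x), 1) into rotate ((x), 64- 1ull) in the SHA512_S2_WO / SHA512_S3_WO macros below. A minimal C check of the identity, with shift-based rotates standing in for the builtin:

    #include <assert.h>
    #include <stdint.h>

    /* shift-based reference, as in the removed generic path */
    static uint64_t rotr64_ref (uint64_t a, unsigned n) { return (a >> n) | (a << (64 - n)); }

    /* stand-in for OpenCL rotate (), which rotates left */
    static uint64_t rotl64_ref (uint64_t a, unsigned n) { return (a << n) | (a >> (64 - n)); }

    int main (void)
    {
      const uint64_t x = 0x0123456789abcdefULL;

      for (unsigned n = 1; n < 64; n++)
      {
        assert (rotr64_ref (x, n) == rotl64_ref (x, 64 - n));
      }

      return 0;
    }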

@@ -284,8 +284,8 @@
 #define SHA512_EXPAND(x,y,z,w) (SHA512_S3 (x) + y + SHA512_S2 (z) + w)
 
-#define SHA512_S2_WO(x) (rotr64 ((x),  1) ^ rotr64 ((x),  8) ^ SHIFT_RIGHT_64 ((x), 7))
-#define SHA512_S3_WO(x) (rotr64 ((x), 19) ^ rotr64 ((x), 61) ^ SHIFT_RIGHT_64 ((x), 6))
+#define SHA512_S2_WO(x) (rotate ((x), 64- 1ull) ^ rotate ((x), 64- 8ull) ^ SHIFT_RIGHT_64 ((x), 7))
+#define SHA512_S3_WO(x) (rotate ((x), 64-19ull) ^ rotate ((x), 64-61ull) ^ SHIFT_RIGHT_64 ((x), 6))
 #define SHA512_EXPAND_WO(x,y,z,w) (SHA512_S3_WO (x) + y + SHA512_S2_WO (z) + w)
 
 #endif

@@ -256,7 +256,8 @@ void hc_clBuildProgram (cl_program program, cl_uint num_devices, const cl_device
   {
     log_error ("ERROR: %s %d\n", "clBuildProgram()", CL_err);
 
-    exit (-1);
+    // If we exit here we can't see the error message
+    // exit (-1);
   }
 }
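
Commenting out the exit only helps if the compiler output is then surfaced; a sketch of fetching the build log after a failed clBuildProgram (standard OpenCL API calls, the helper name is made up):

    #include <stdio.h>
    #include <stdlib.h>
    #include <CL/cl.h>

    /* Print the kernel build log for one device after clBuildProgram () failed. */
    static void print_build_log (cl_program program, cl_device_id device)
    {
      size_t log_size = 0;

      clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

      char *log = (char *) malloc (log_size + 1);

      clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

      log[log_size] = 0;

      fprintf (stderr, "%s\n", log);

      free (log);
    }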

@@ -11130,7 +11130,7 @@ int main (int argc, char **argv)
                 gpu_accel = 32;
                 break;
    case  1800: gpu_loops = ROUNDS_SHA512CRYPT;
-               gpu_accel = 8;
+               gpu_accel = 16;
                break;
    case  2100: gpu_loops = ROUNDS_DCC2;
                gpu_accel = 16;
@@ -11139,7 +11139,7 @@ int main (int argc, char **argv)
                gpu_accel = 32;
                break;
    case  3200: gpu_loops = ROUNDS_BCRYPT;
-               gpu_accel = 2;
+               gpu_accel = 8;
                break;
    case  5200: gpu_loops = ROUNDS_PSAFE3;
                gpu_accel = 16;
@@ -11202,19 +11202,19 @@ int main (int argc, char **argv)
                gpu_accel = 64;
                break;
    case  7100: gpu_loops = ROUNDS_SHA512OSX;
-               gpu_accel = 2;
+               gpu_accel = 8;
                break;
    case  7200: gpu_loops = ROUNDS_GRUB;
-               gpu_accel = 2;
+               gpu_accel = 16;
                break;
    case  7400: gpu_loops = ROUNDS_SHA256CRYPT;
-               gpu_accel = 4;
+               gpu_accel = 8;
                break;
    case  7900: gpu_loops = ROUNDS_DRUPAL7;
                gpu_accel = 8;
                break;
    case  8200: gpu_loops = ROUNDS_CLOUDKEY;
-               gpu_accel = 2;
+               gpu_accel = 8;
                break;
    case  8800: gpu_loops = ROUNDS_ANDROIDFDE;
                gpu_accel = 32;
@@ -11232,7 +11232,7 @@ int main (int argc, char **argv)
                gpu_accel = 8;
                break;
    case  9300: gpu_loops = 1;
-               gpu_accel = 4;
+               gpu_accel = 8;
                break;
    case  9400: gpu_loops = ROUNDS_OFFICE2007;
                gpu_accel = 32;
@@ -11241,7 +11241,7 @@ int main (int argc, char **argv)
                gpu_accel = 32;
                break;
    case  9600: gpu_loops = ROUNDS_OFFICE2013;
-               gpu_accel = 4;
+               gpu_accel = 8;
                break;
    case 10000: gpu_loops = ROUNDS_DJANGOPBKDF2;
                gpu_accel = 8;
@@ -11259,10 +11259,10 @@ int main (int argc, char **argv)
                gpu_accel = 8;
                break;
    case 11300: gpu_loops = ROUNDS_BITCOIN_WALLET;
-               gpu_accel = 2;
+               gpu_accel = 8;
                break;
    case 11600: gpu_loops = ROUNDS_SEVEN_ZIP;
-               gpu_accel = 4;
+               gpu_accel = 8;
                break;
    case 11900: gpu_loops = ROUNDS_PBKDF2_MD5;
                gpu_accel = 8;
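
All of the tuning changes above raise gpu_accel for slow, iterated hashes. Roughly speaking, the number of password candidates processed per kernel launch grows linearly with gpu_accel, so e.g. bcrypt (3200) going from 2 to 8 quadruples the batch per launch. A toy model of that relationship (the formula and constants are illustrative assumptions, not oclHashcat's exact internals):

    #include <stdio.h>

    int main (void)
    {
      const unsigned compute_units  = 32;  /* hypothetical GPU */
      const unsigned kernel_threads = 64;  /* hypothetical work-group size */

      for (unsigned gpu_accel = 2; gpu_accel <= 16; gpu_accel *= 2)
      {
        /* assumed model: candidates per launch scales with accel */
        unsigned long batch = (unsigned long) compute_units * kernel_threads * gpu_accel;

        printf ("gpu_accel = %2u -> %lu candidates per launch\n", gpu_accel, batch);
      }

      return 0;
    }

Larger batches amortize kernel-launch overhead, which is where the speedup comes from, at the cost of longer per-launch runtimes and coarser status updates.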
