Cracking bcrypt: Use a feedback from the OpenCL runtime to dynamically find out optimal thread count

pull/1939/head^2
jsteube 5 years ago
parent 42e440611a
commit 5ecbcde945

@ -299,16 +299,21 @@ __constant u32a c_sbox3[256] =
0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6
};
#define BF_ROUND(L,R,N) \
{ \
u32 tmp; \
\
tmp = S0[hc_bfe_S ((L), 24, 8)]; \
tmp += S1[hc_bfe_S ((L), 16, 8)]; \
tmp ^= S2[hc_bfe_S ((L), 8, 8)]; \
tmp += S3[hc_bfe_S ((L), 0, 8)]; \
\
(R) ^= tmp ^ P[(N)]; \
#define BF_ROUND(L,R,N) \
{ \
u32 tmp; \
\
const u32 r0 = hc_bfe_S ((L), 24, 8); \
const u32 r1 = hc_bfe_S ((L), 16, 8); \
const u32 r2 = hc_bfe_S ((L), 8, 8); \
const u32 r3 = hc_bfe_S ((L), 0, 8); \
\
tmp = S0[r0]; \
tmp += S1[r1]; \
tmp ^= S2[r2]; \
tmp += S3[r3]; \
\
(R) ^= tmp ^ P[(N)]; \
}
#define BF_ENCRYPT(L,R) \
@ -361,7 +366,7 @@ DECLSPEC void expand_key (u32 *E, u32 *W, const int len)
}
}
__kernel void m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
{
/**
* base
@ -438,10 +443,10 @@ __kernel void m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
* do the key setup
*/
__local u32 S0_all[8][256];
__local u32 S1_all[8][256];
__local u32 S2_all[8][256];
__local u32 S3_all[8][256];
__local u32 S0_all[FIXED_LOCAL_SIZE][256];
__local u32 S1_all[FIXED_LOCAL_SIZE][256];
__local u32 S2_all[FIXED_LOCAL_SIZE][256];
__local u32 S3_all[FIXED_LOCAL_SIZE][256];
__local u32 *S0 = S0_all[lid];
__local u32 *S1 = S1_all[lid];
@ -580,7 +585,7 @@ __kernel void m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t))
}
}
__kernel void m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
{
/**
* base
@ -607,10 +612,10 @@ __kernel void m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
P[i] = tmps[gid].P[i];
}
__local u32 S0_all[8][256];
__local u32 S1_all[8][256];
__local u32 S2_all[8][256];
__local u32 S3_all[8][256];
__local u32 S0_all[FIXED_LOCAL_SIZE][256];
__local u32 S1_all[FIXED_LOCAL_SIZE][256];
__local u32 S2_all[FIXED_LOCAL_SIZE][256];
__local u32 S3_all[FIXED_LOCAL_SIZE][256];
__local u32 *S0 = S0_all[lid];
__local u32 *S1 = S1_all[lid];
@ -778,7 +783,7 @@ __kernel void m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t))
}
}
__kernel void m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t))
__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t))
{
/**
* base
@ -798,10 +803,10 @@ __kernel void m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t))
P[i] = tmps[gid].P[i];
}
__local u32 S0_all[8][256];
__local u32 S1_all[8][256];
__local u32 S2_all[8][256];
__local u32 S3_all[8][256];
__local u32 S0_all[FIXED_LOCAL_SIZE][256];
__local u32 S1_all[FIXED_LOCAL_SIZE][256];
__local u32 S2_all[FIXED_LOCAL_SIZE][256];
__local u32 S3_all[FIXED_LOCAL_SIZE][256];
__local u32 *S0 = S0_all[lid];
__local u32 *S1 = S1_all[lid];

@ -43,6 +43,7 @@
## Improvements
##
- Cracking bcrypt: Use a feedback from the OpenCL runtime to dynamically find out optimal thread count
- Bitcoin Wallet: Be more user friendly by allowing a larger data range for ckey and public_key
- Building: Updated BUILD.md
- My Wallet: Added additional plaintext pattern used in newer versions

@ -72,18 +72,34 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c
return tmp_size;
}
u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param)
{
const u32 kernel_threads_min = 8;
return kernel_threads_min;
}
u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra)
{
const u32 kernel_threads_max = 8;
return kernel_threads_max;
char *jit_build_options = NULL;
// this uses some nice feedback effect.
// based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value
// which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result.
// therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1.
// note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with:
// Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max)
// on my development system. no clue where the 4 bytes are spent.
// I did some research on this and it seems to be related with the datatype.
// For example, if i used u8 instead, there's only 1 byte wasted.
u32 fixed_local_size = 0;
if (device_param->device_type & CL_DEVICE_TYPE_CPU)
{
fixed_local_size = 1;
}
else
{
fixed_local_size = (device_param->device_local_mem_size - 4) / 4096;
}
hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size);
return jit_build_options;
}
int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len)
@ -236,14 +252,14 @@ void module_init (module_ctx_t *module_ctx)
module_ctx->module_hook23 = MODULE_DEFAULT;
module_ctx->module_hook_salt_size = MODULE_DEFAULT;
module_ctx->module_hook_size = MODULE_DEFAULT;
module_ctx->module_jit_build_options = MODULE_DEFAULT;
module_ctx->module_jit_build_options = module_jit_build_options;
module_ctx->module_jit_cache_disable = MODULE_DEFAULT;
module_ctx->module_kernel_accel_max = MODULE_DEFAULT;
module_ctx->module_kernel_accel_min = MODULE_DEFAULT;
module_ctx->module_kernel_loops_max = MODULE_DEFAULT;
module_ctx->module_kernel_loops_min = MODULE_DEFAULT;
module_ctx->module_kernel_threads_max = module_kernel_threads_max;
module_ctx->module_kernel_threads_min = module_kernel_threads_min;
module_ctx->module_kernel_threads_max = MODULE_DEFAULT;
module_ctx->module_kernel_threads_min = MODULE_DEFAULT;
module_ctx->module_kern_type = module_kern_type;
module_ctx->module_kern_type_dynamic = MODULE_DEFAULT;
module_ctx->module_opti_type = module_opti_type;

Loading…
Cancel
Save