Introduce a true memset kernel, currently operates on 16 byte per item

pull/352/merge
jsteube 8 years ago
parent fd7bc2736d
commit 2dd8156d24

@ -9322,3 +9322,12 @@ inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4],
#endif
}
__kernel void gpu_memset (__global uint4 *buf, const u32 value, const u32 gid_max)
{
const u32 gid = get_global_id (0);
if (gid >= gid_max) return;
buf[gid] = (uint4) (value);
}

@ -1016,13 +1016,13 @@ struct __hc_device_param
cl_kernel kernel_amp;
cl_kernel kernel_tm;
cl_kernel kernel_weak;
cl_kernel kernel_memset;
cl_context context;
cl_program program;
cl_program program_mp;
cl_program program_amp;
cl_program program_weak;
cl_command_queue command_queue;
@ -1064,6 +1064,7 @@ struct __hc_device_param
void *kernel_params_mp_l[PARAMCNT];
void *kernel_params_amp[PARAMCNT];
void *kernel_params_tm[PARAMCNT];
void *kernel_params_memset[PARAMCNT];
u32 kernel_params_buf32[PARAMCNT];
@ -1077,6 +1078,7 @@ struct __hc_device_param
u64 kernel_params_mp_l_buf64[PARAMCNT];
u32 kernel_params_amp_buf32[PARAMCNT];
u32 kernel_params_memset_buf32[PARAMCNT];
};
typedef struct __hc_device_param hc_device_param_t;

@ -2609,8 +2609,56 @@ static void run_kernel_amp (hc_device_param_t *device_param, const uint num)
hc_clFinish (data.ocl, device_param->command_queue);
}
static void run_kernel_memset (hc_device_param_t *device_param, cl_mem buf, const uint value, const uint num)
{
const u32 num16d = num / 16;
const u32 num16m = num % 16;
if (num16d)
{
device_param->kernel_params_memset_buf32[1] = value;
device_param->kernel_params_memset_buf32[2] = num16d;
uint kernel_threads = device_param->kernel_threads;
uint num_elements = num16d;
while (num_elements % kernel_threads) num_elements++;
cl_kernel kernel = device_param->kernel_memset;
hc_clSetKernelArg (data.ocl, kernel, 0, sizeof (cl_mem), (void *) &buf);
hc_clSetKernelArg (data.ocl, kernel, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]);
hc_clSetKernelArg (data.ocl, kernel, 2, sizeof (cl_uint), device_param->kernel_params_memset[2]);
const size_t global_work_size[3] = { num_elements, 1, 1 };
const size_t local_work_size[3] = { kernel_threads, 1, 1 };
hc_clEnqueueNDRangeKernel (data.ocl, device_param->command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
hc_clFlush (data.ocl, device_param->command_queue);
hc_clFinish (data.ocl, device_param->command_queue);
}
if (num16m)
{
u32 tmp[4];
tmp[0] = value;
tmp[1] = value;
tmp[2] = value;
tmp[3] = value;
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, buf, CL_TRUE, num16d * 16, num16m, tmp, 0, NULL, NULL);
}
}
static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const size_t size)
{
run_kernel_memset (device_param, buf, 0, size);
/*
int rc = -1;
if (device_param->opencl_v12 && device_param->platform_vendor_id == VENDOR_ID_AMD)
@ -2644,6 +2692,7 @@ static void run_kernel_bzero (hc_device_param_t *device_param, cl_mem buf, const
myfree (tmp);
}
*/
}
static void choose_kernel (hc_device_param_t *device_param, const uint attack_exec, const uint attack_mode, const uint opts_type, const salt_t *salt_buf, const uint highest_pw_len, const uint pws_cnt)
@ -2895,6 +2944,14 @@ static void autotune (hc_device_param_t *device_param)
const u32 kernel_power_max = device_param->device_processors * device_param->kernel_threads * kernel_accel_max;
run_kernel_memset (device_param, device_param->d_pws_buf, 7, kernel_power_max * sizeof (pw_t));
if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
{
run_kernel_memset (device_param, device_param->d_pws_amp_buf, 7, kernel_power_max * sizeof (pw_t));
}
/*
for (u32 i = 0; i < kernel_power_max; i++)
{
device_param->pws_buf[i].i[0] = i;
@ -2908,6 +2965,7 @@ static void autotune (hc_device_param_t *device_param)
{
run_kernel_amp (device_param, kernel_power_max);
}
*/
#define VERIFIER_CNT 1
@ -3030,10 +3088,19 @@ static void autotune (hc_device_param_t *device_param)
// reset them fake words
/*
memset (device_param->pws_buf, 0, kernel_power_max * sizeof (pw_t));
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_buf, CL_TRUE, 0, kernel_power_max * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_pws_amp_buf, CL_TRUE, 0, kernel_power_max * sizeof (pw_t), device_param->pws_buf, 0, NULL, NULL);
*/
run_kernel_memset (device_param, device_param->d_pws_buf, 0, kernel_power_max * sizeof (pw_t));
if (data.attack_exec == ATTACK_EXEC_OUTSIDE_KERNEL)
{
run_kernel_memset (device_param, device_param->d_pws_amp_buf, 0, kernel_power_max * sizeof (pw_t));
}
// reset timer
@ -15084,13 +15151,6 @@ int main (int argc, char **argv)
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_digests_shown, CL_TRUE, 0, size_shown, data.digests_shown, 0, NULL, NULL);
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_salt_bufs, CL_TRUE, 0, size_salts, data.salts_buf, 0, NULL, NULL);
run_kernel_bzero (device_param, device_param->d_pws_buf, size_pws);
run_kernel_bzero (device_param, device_param->d_pws_amp_buf, size_pws);
run_kernel_bzero (device_param, device_param->d_tmps, size_tmps);
run_kernel_bzero (device_param, device_param->d_hooks, size_hooks);
run_kernel_bzero (device_param, device_param->d_plain_bufs, size_plains);
run_kernel_bzero (device_param, device_param->d_result, size_results);
/**
* special buffers
*/
@ -15101,8 +15161,6 @@ int main (int argc, char **argv)
device_param->d_rules_c = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_rules_c, NULL);
hc_clEnqueueWriteBuffer (data.ocl, device_param->command_queue, device_param->d_rules, CL_TRUE, 0, size_rules, kernel_rules_buf, 0, NULL, NULL);
run_kernel_bzero (device_param, device_param->d_rules_c, size_rules_c);
}
else if (attack_kern == ATTACK_KERN_COMBI)
{
@ -15110,11 +15168,6 @@ int main (int argc, char **argv)
device_param->d_combs_c = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_combs, NULL);
device_param->d_root_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_root_css, NULL);
device_param->d_markov_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_markov_css, NULL);
run_kernel_bzero (device_param, device_param->d_combs, size_combs);
run_kernel_bzero (device_param, device_param->d_combs_c, size_combs);
run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css);
run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css);
}
else if (attack_kern == ATTACK_KERN_BF)
{
@ -15123,12 +15176,6 @@ int main (int argc, char **argv)
device_param->d_tm_c = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_tm, NULL);
device_param->d_root_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_root_css, NULL);
device_param->d_markov_css_buf = hc_clCreateBuffer (data.ocl, device_param->context, CL_MEM_READ_ONLY, size_markov_css, NULL);
run_kernel_bzero (device_param, device_param->d_bfs, size_bfs);
run_kernel_bzero (device_param, device_param->d_bfs_c, size_bfs);
run_kernel_bzero (device_param, device_param->d_tm_c, size_tm);
run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css);
run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css);
}
if (size_esalts)
@ -15272,6 +15319,13 @@ int main (int argc, char **argv)
device_param->kernel_params_tm[0] = &device_param->d_bfs_c;
device_param->kernel_params_tm[1] = &device_param->d_tm_c;
device_param->kernel_params_memset_buf32[1] = 0; // value
device_param->kernel_params_memset_buf32[2] = 0; // gid_max
device_param->kernel_params_memset[0] = NULL;
device_param->kernel_params_memset[1] = &device_param->kernel_params_memset_buf32[1];
device_param->kernel_params_memset[2] = &device_param->kernel_params_memset_buf32[2];
/**
* kernel name
*/
@ -15380,6 +15434,18 @@ int main (int argc, char **argv)
if (opts_type & OPTS_TYPE_HOOK23) hc_clSetKernelArg (data.ocl, device_param->kernel23, i, sizeof (cl_uint), device_param->kernel_params[i]);
}
// GPU memset
device_param->kernel_memset = hc_clCreateKernel (data.ocl, device_param->program, "gpu_memset");
hc_clGetKernelWorkGroupInfo (data.ocl, device_param->kernel_memset, device_param->device, CL_KERNEL_WORK_GROUP_SIZE, sizeof (size_t), &kernel_wgs_tmp, NULL); kernel_threads = MIN (kernel_threads, kernel_wgs_tmp);
hc_clSetKernelArg (data.ocl, device_param->kernel_memset, 0, sizeof (cl_mem), device_param->kernel_params_memset[0]);
hc_clSetKernelArg (data.ocl, device_param->kernel_memset, 1, sizeof (cl_uint), device_param->kernel_params_memset[1]);
hc_clSetKernelArg (data.ocl, device_param->kernel_memset, 2, sizeof (cl_uint), device_param->kernel_params_memset[2]);
// MP start
if (attack_mode == ATTACK_MODE_BF)
{
device_param->kernel_mp_l = hc_clCreateKernel (data.ocl, device_param->program_mp, "l_markov");
@ -15440,6 +15506,39 @@ int main (int argc, char **argv)
device_param->kernel_threads = kernel_threads;
// zero some data buffers
run_kernel_bzero (device_param, device_param->d_pws_buf, size_pws);
run_kernel_bzero (device_param, device_param->d_pws_amp_buf, size_pws);
run_kernel_bzero (device_param, device_param->d_tmps, size_tmps);
run_kernel_bzero (device_param, device_param->d_hooks, size_hooks);
run_kernel_bzero (device_param, device_param->d_plain_bufs, size_plains);
run_kernel_bzero (device_param, device_param->d_result, size_results);
/**
* special buffers
*/
if (attack_kern == ATTACK_KERN_STRAIGHT)
{
run_kernel_bzero (device_param, device_param->d_rules_c, size_rules_c);
}
else if (attack_kern == ATTACK_KERN_COMBI)
{
run_kernel_bzero (device_param, device_param->d_combs, size_combs);
run_kernel_bzero (device_param, device_param->d_combs_c, size_combs);
run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css);
run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css);
}
else if (attack_kern == ATTACK_KERN_BF)
{
run_kernel_bzero (device_param, device_param->d_bfs, size_bfs);
run_kernel_bzero (device_param, device_param->d_bfs_c, size_bfs);
run_kernel_bzero (device_param, device_param->d_tm_c, size_tm);
run_kernel_bzero (device_param, device_param->d_root_css_buf, size_root_css);
run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css);
}
/**
* Store initial fanspeed if gpu_temp_retain is enabled
*/

Loading…
Cancel
Save