From fb82bfc169752c6dfc6cd20d6d97f649cf7c2992 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Wed, 8 May 2019 23:30:07 +0200 Subject: [PATCH] Improve thread handling based on FIXED_LOCAL_SIZE --- src/backend.c | 40 ++++++++++++-------------------------- src/modules/module_03200.c | 7 ------- src/modules/module_09000.c | 37 +++++++++++++++++++++++++++-------- src/modules/module_18600.c | 37 +++++++++++++++++++++++++++-------- 4 files changed, 70 insertions(+), 51 deletions(-) diff --git a/src/backend.c b/src/backend.c index c5ea4bf1f..ed88b191f 100644 --- a/src/backend.c +++ b/src/backend.c @@ -3093,33 +3093,6 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (device_param->is_cuda == true) { - u64 local_mem_size = 0; - - switch (kern_run) - { - case KERN_RUN_1: local_mem_size = device_param->kernel_local_mem_size1; break; - case KERN_RUN_12: local_mem_size = device_param->kernel_local_mem_size12; break; - case KERN_RUN_2: local_mem_size = device_param->kernel_local_mem_size2; break; - case KERN_RUN_23: local_mem_size = device_param->kernel_local_mem_size23; break; - case KERN_RUN_3: local_mem_size = device_param->kernel_local_mem_size3; break; - case KERN_RUN_4: local_mem_size = device_param->kernel_local_mem_size4; break; - case KERN_RUN_INIT2: local_mem_size = device_param->kernel_local_mem_size_init2; break; - case KERN_RUN_LOOP2: local_mem_size = device_param->kernel_local_mem_size_loop2; break; - case KERN_RUN_AUX1: local_mem_size = device_param->kernel_local_mem_size_aux1; break; - case KERN_RUN_AUX2: local_mem_size = device_param->kernel_local_mem_size_aux2; break; - case KERN_RUN_AUX3: local_mem_size = device_param->kernel_local_mem_size_aux3; break; - case KERN_RUN_AUX4: local_mem_size = device_param->kernel_local_mem_size_aux4; break; - } - - /* - if (local_mem_size) - { - const u32 max_threads_possible = (device_param->device_local_mem_size - 240) / local_mem_size; - - kernel_threads = MIN (kernel_threads, 
max_threads_possible); - } - */ - CUfunction cuda_function = NULL; if (device_param->is_cuda == true) @@ -7039,7 +7012,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } } - // there's not thread column in tuning db, stick to commandline if defined + // there's no thread column in tuning db, stick to commandline if defined if (user_options->kernel_threads_chgd == true) { @@ -7291,6 +7264,17 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if (jit_build_options != NULL) { build_options_module_len += snprintf (build_options_module_buf + build_options_module_len, build_options_sz - build_options_module_len, "%s", jit_build_options); + + // this is a bit ugly + // would be better to have the module return the value as value + + u32 fixed_local_size = 0; + + if (sscanf (jit_build_options, "-D FIXED_LOCAL_SIZE=%u", &fixed_local_size) == 1) + { + device_param->kernel_threads_min = fixed_local_size; + device_param->kernel_threads_max = fixed_local_size; + } } } diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c index 6cd15c7c7..b0b35b627 100644 --- a/src/modules/module_03200.c +++ b/src/modules/module_03200.c @@ -108,13 +108,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY { overhead = 4; } - - // no clue yet where this is coming from - - if (device_param->is_cuda == true) - { - overhead = 240; - } } if (user_options->kernel_threads_chgd == true) diff --git a/src/modules/module_09000.c b/src/modules/module_09000.c index 8817fd4b6..e8cdac075 100644 --- a/src/modules/module_09000.c +++ b/src/modules/module_09000.c @@ -74,6 +74,11 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY { char *jit_build_options = NULL; + // this uses some nice feedback effect. + // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value + // which is then read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result. 
+ // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1. + u32 fixed_local_size = 0; if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) @@ -82,19 +87,35 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY } else { - if (user_options->kernel_threads_chgd == true) - { - fixed_local_size = user_options->kernel_threads; - } - else - { - u32 overhead = 0; + u32 overhead = 0; - if (device_param->opencl_device_vendor_id == VENDOR_ID_NV) + if (device_param->opencl_device_vendor_id == VENDOR_ID_NV) + { + // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with: + // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max) + // on my development system. no clue where the 4 bytes are spent. + // I did some research on this and it seems to be related to the datatype. + // For example, if I used u8 instead, there's only 1 byte wasted. + + if (device_param->is_opencl == true) { overhead = 4; } + } + if (user_options->kernel_threads_chgd == true) + { + fixed_local_size = user_options->kernel_threads; + + // otherwise out-of-bound reads + + if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead)) + { + fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + } + } + else + { fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; } } diff --git a/src/modules/module_18600.c b/src/modules/module_18600.c index 109a3f65c..663717538 100644 --- a/src/modules/module_18600.c +++ b/src/modules/module_18600.c @@ -66,6 +66,11 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY { char *jit_build_options = NULL; + // this uses some nice feedback effect. 
+ // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value + // which is then read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result. + // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1. + u32 fixed_local_size = 0; if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) @@ -74,19 +79,35 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY } else { - if (user_options->kernel_threads_chgd == true) - { - fixed_local_size = user_options->kernel_threads; - } - else - { - u32 overhead = 0; + u32 overhead = 0; - if (device_param->opencl_device_vendor_id == VENDOR_ID_NV) + if (device_param->opencl_device_vendor_id == VENDOR_ID_NV) + { + // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with: + // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max) + // on my development system. no clue where the 4 bytes are spent. + // I did some research on this and it seems to be related to the datatype. + // For example, if I used u8 instead, there's only 1 byte wasted. + + if (device_param->is_opencl == true) { overhead = 4; } + } + if (user_options->kernel_threads_chgd == true) + { + fixed_local_size = user_options->kernel_threads; + + // otherwise out-of-bound reads + + if ((fixed_local_size * 4096) > (device_param->device_local_mem_size - overhead)) + { + fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; + } + } + else + { fixed_local_size = (device_param->device_local_mem_size - overhead) / 4096; } }