From d4997d12559524699b9cccae6142a08d08e9c5fb Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 31 Jul 2021 16:49:39 +0200 Subject: [PATCH 1/3] Added support for auto-tuning --kernel-threads (-T) on startup --- src/autotune.c | 264 +++++++++++++++++++++++-------------- src/backend.c | 136 ++++++++++--------- src/modules/module_01500.c | 9 +- src/modules/module_03000.c | 9 +- src/modules/module_12500.c | 11 +- src/modules/module_14000.c | 9 +- src/modules/module_14600.c | 13 +- src/modules/module_23700.c | 11 +- src/modules/module_23800.c | 11 +- src/selftest.c | 6 + tools/benchmark_deep.pl | 1 - 11 files changed, 247 insertions(+), 233 deletions(-) diff --git a/src/autotune.c b/src/autotune.c index cdeb73553..d6c406513 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -10,7 +10,7 @@ #include "status.h" #include "autotune.h" -static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops) +static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads) { hashconfig_t *hashconfig = hashcat_ctx->hashconfig; user_options_t *user_options = hashcat_ctx->user_options; @@ -19,7 +19,9 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes - u32 kernel_power_try = device_param->hardware_power * kernel_accel; + const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads; + + u32 kernel_power_try = hardware_power * kernel_accel; if (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) { @@ -33,6 +35,10 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par } } + const u32 kernel_threads_sav = device_param->kernel_threads; + + device_param->kernel_threads = kernel_threads; + const double spin_damp_sav = device_param->spin_damp; device_param->spin_damp = 0; @@ -50,71 +56,51 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par } else { - run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_try, true, 0); - - if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE) - { - run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_try, true, 0); - } - run_kernel (hashcat_ctx, device_param, KERN_RUN_2, 0, kernel_power_try, true, 0); } device_param->spin_damp = spin_damp_sav; - const double exec_msec_prev = get_avg_exec_time (device_param, 1); - - return exec_msec_prev; -} - -/* -static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops) -{ - hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - - device_param->kernel_params_buf32[28] = 0; - device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set - device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes - - const u32 kernel_power_try = device_param->hardware_power * kernel_accel; - - const u32 kernel_threads_sav = device_param->kernel_threads; - - const double spin_damp_sav = device_param->spin_damp; - - device_param->spin_damp = 0; - - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - { - if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) - { - device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple1; - - run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_try, true, 0); - } - else - { - device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple4; - - run_kernel (hashcat_ctx, device_param, KERN_RUN_4, 0, kernel_power_try, true, 0); - } - } - else - { - device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple2; - - run_kernel (hashcat_ctx, device_param, KERN_RUN_2, 0, kernel_power_try, true, 0); - } - device_param->kernel_threads = kernel_threads_sav; - device_param->spin_damp = spin_damp_sav; - const double exec_msec_prev = get_avg_exec_time (device_param, 1); return exec_msec_prev; } -*/ + +static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads, const int times) +{ + double exec_msec_best = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + + for (int i = 1; i < times; i++) + { + double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + + if (exec_msec > exec_msec_best) continue; + + exec_msec_best = exec_msec; + } + + return exec_msec_best; +} + +static u32 previous_power_of_two (const u32 x) +{ + // https://stackoverflow.com/questions/2679815/previous-power-of-2 + // really cool! + + if (x == 0) return 0; + + u32 r = x; + + r |= (r >> 1); + r |= (r >> 2); + r |= (r >> 4); + r |= (r >> 8); + r |= (r >> 16); + + return r - (r >> 1); +} static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { @@ -131,9 +117,57 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param const u32 kernel_loops_min = device_param->kernel_loops_min; const u32 kernel_loops_max = device_param->kernel_loops_max; + const u32 kernel_threads_min = device_param->kernel_threads_min; + const u32 kernel_threads_max = device_param->kernel_threads_max; + u32 kernel_accel = kernel_accel_min; u32 kernel_loops = kernel_loops_min; + // for the threads we take as initial value what we receive from the runtime + // but is only to start with something, we will fine tune this value as soon as we have our workload specified + // this thread limiting is also performed insinde run_kernel() so we need to redo it here, too + + u32 kernel_wgs = 0; + u32 kernel_wgs_multiple = 0; + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + { + kernel_wgs = device_param->kernel_wgs1; + + kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple1; + } + else + { + kernel_wgs = device_param->kernel_wgs4; + + kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple4; + } + } + else + { + kernel_wgs = device_param->kernel_wgs2; + + kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple2; + } + + u32 kernel_threads = kernel_threads_max; + + if ((kernel_wgs >= kernel_threads_min) && (kernel_wgs <= kernel_threads_max)) + { + kernel_threads = kernel_wgs; + } + + // having a value power of 2 makes it easier to divide + + const u32 kernel_threads_p2 = previous_power_of_two (kernel_threads); + + if ((kernel_threads_p2 >= kernel_threads_min) && (kernel_threads_p2 <= kernel_threads_max)) + { + kernel_threads = kernel_threads_p2; + } + // in this case the user specified a fixed -n and -u on the commandline // no way to tune anything // but we need to run a few caching rounds @@ -149,10 +183,10 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (hashconfig->warmup_disable == false) { - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); } #endif @@ -222,13 +256,37 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } } + // we also need to initialize some values using kernels + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + // nothing to do + } + else + { + const u32 kernel_threads_sav = device_param->kernel_threads; + + device_param->kernel_threads = device_param->kernel_wgs1; + + run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_max, false, 0); + + if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE) + { + device_param->kernel_threads = device_param->kernel_wgs2p; + + run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_max, false, 0); + } + + device_param->kernel_threads = kernel_threads_sav; + } + // Do a pre-autotune test run to find out if kernel runtime is above some TDR limit u32 kernel_loops_max_reduced = kernel_loops_max; if (true) { - double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min); + double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads); if (exec_msec > 2000) { @@ -237,7 +295,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param return -1; } - exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min); + exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads); const u32 mm = kernel_loops_max / kernel_loops_min; @@ -257,16 +315,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param { if (kernel_loops > kernel_loops_max_reduced) continue; - double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops); + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops, kernel_threads, 1); if (exec_msec < target_msec) break; } } - // now the same for kernel-accel but with the new kernel-loops from previous loop set - #define STEPS_CNT 16 + // now the same for kernel-accel but with the new kernel-loops from previous loop set + if (kernel_accel_min < kernel_accel_max) { for (int i = 0; i < STEPS_CNT; i++) @@ -276,7 +334,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (kernel_accel_try < kernel_accel_min) continue; if (kernel_accel_try > kernel_accel_max) break; - double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops); + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads, 1); if (exec_msec > target_msec) break; @@ -292,7 +350,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param const u32 kernel_accel_orig = kernel_accel; const u32 kernel_loops_orig = kernel_loops; - double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1); for (int i = 1; i < STEPS_CNT; i++) { @@ -307,7 +365,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // do a real test - const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try); + const double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try, kernel_threads, 1); if (exec_msec_prev < exec_msec) break; @@ -324,7 +382,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } } - double exec_msec_pre_final = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + double exec_msec_pre_final = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1); const u32 exec_left = (const u32) (target_msec / exec_msec_pre_final); @@ -339,46 +397,43 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param kernel_accel *= exec_accel_min; } - // start finding best thread count is easier. - // it's either the preferred or the maximum thread count + // v6.2.4 new section: find thread count + // This is not as effective as it could be because of inaccurate kernel return timers + // But is better than fixed values + // Timers in this section are critical, so we rerun meassurements 3 times - /* - const u32 kernel_threads_min = device_param->kernel_threads_min; - const u32 kernel_threads_max = device_param->kernel_threads_max; - - if (kernel_threads_min < kernel_threads_max) + if (kernel_threads_max > kernel_threads_min) { - const double exec_msec_max = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + const u32 kernel_accel_orig = kernel_accel; + const u32 kernel_threads_orig = kernel_threads; - u32 preferred_threads = 0; + double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 3); - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + for (int i = 1; i < STEPS_CNT; i++) { - if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) - { - preferred_threads = device_param->kernel_preferred_wgs_multiple1; - } - else - { - preferred_threads = device_param->kernel_preferred_wgs_multiple4; - } - } - else - { - preferred_threads = device_param->kernel_preferred_wgs_multiple2; - } + const u32 kernel_accel_try = kernel_accel_orig * (1U << i); + const u32 kernel_threads_try = kernel_threads_orig / (1U << i); - if ((preferred_threads >= kernel_threads_min) && (preferred_threads <= kernel_threads_max)) - { - const double exec_msec_preferred = try_run_preferred (hashcat_ctx, device_param, kernel_accel, kernel_loops); + // since we do not modify total amount of workitems, we can (and need) to do increase kernel_accel_max - if (exec_msec_preferred < exec_msec_max) - { - device_param->kernel_threads = preferred_threads; - } + const u32 kernel_accel_max_try = kernel_accel_max * (1U << i); + + if (kernel_accel_try > kernel_accel_max_try) break; + + if (kernel_threads_try < kernel_threads_min) break; + + if (kernel_threads_try % kernel_wgs_multiple) break; // this would just be waste of time + + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads_try, 3); + + if (exec_msec > exec_msec_prev) continue; + + exec_msec_prev = exec_msec; + + kernel_accel = kernel_accel_try; + kernel_threads = kernel_threads_try; } } - */ } // reset them fake words @@ -478,8 +533,13 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // store - device_param->kernel_accel = kernel_accel; - device_param->kernel_loops = kernel_loops; + device_param->kernel_accel = kernel_accel; + device_param->kernel_loops = kernel_loops; + device_param->kernel_threads = kernel_threads; + + const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads; + + device_param->hardware_power = hardware_power; const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel; diff --git a/src/backend.c b/src/backend.c index 05cdeeac4..e95793c4f 100644 --- a/src/backend.c +++ b/src/backend.c @@ -5420,6 +5420,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event2) == -1) return -1; + if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event1) == -1) return -1; + float exec_ms; if (hc_cuEventElapsedTime (hashcat_ctx, &exec_ms, device_param->cuda_event1, device_param->cuda_event2) == -1) return -1; @@ -9095,6 +9097,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) } } + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + { + // they like this + + device_param->kernel_preferred_wgs_multiple = 1; + } + if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) @@ -10113,61 +10122,6 @@ static int get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, return 0; } -static u32 get_kernel_threads (const hc_device_param_t *device_param) -{ - // this is an upper limit, a good start, since our strategy is to reduce thread counts only. - - u32 kernel_threads_min = device_param->kernel_threads_min; - u32 kernel_threads_max = device_param->kernel_threads_max; - - // the changes we do here are just optimizations, since the module always has priority. - - const u32 device_maxworkgroup_size = (const u32) device_param->device_maxworkgroup_size; - - kernel_threads_max = MIN (kernel_threads_max, device_maxworkgroup_size); - - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) - { - // for all CPU we just do 1 ... - - kernel_threads_max = MIN (kernel_threads_max, 1); - } - else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) - { - // for GPU we need to distinguish by vendor - - if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) - { - kernel_threads_max = MIN (kernel_threads_max, 8); - } - else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD) - { - if (device_param->kernel_preferred_wgs_multiple == 64) - { - // only older AMD GPUs with WaveFront size 64 benefit from this - - kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); - } - } - else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP) - { - if (device_param->kernel_preferred_wgs_multiple == 64) - { - // only older AMD GPUs with WaveFront size 64 benefit from this - - kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); - } - } - } - - // this is intenionally! at this point, kernel_threads_min can be higher than kernel_threads_max. - // in this case we actually want kernel_threads_min selected. - - const u32 kernel_threads = MAX (kernel_threads_min, kernel_threads_max); - - return kernel_threads; -} - static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const char *kernel_name, char *source_file, char *cached_file, const char *build_options_buf, const bool cache_disable, cl_program *opencl_program, CUmodule *cuda_module, hipModule_t *hip_module) { const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; @@ -10206,8 +10160,6 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p if (cached == false) { #if defined (DEBUG) - const user_options_t *user_options = hashcat_ctx->user_options; - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache. Please be patient...", device_param->device_id + 1, filename_from_filepath (cached_file)); #endif @@ -10460,7 +10412,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p //hiprtc_options[1] = "--device-as-default-execution-space"; //hiprtc_options[2] = "--gpu-architecture"; - hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX)); + hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); hiprtc_options[1] = "-nocudainc"; hiprtc_options[2] = "-nocudalib"; @@ -11106,6 +11058,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } } + // this seems to work always + + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + { + u32 native_threads = 1; + + if ((native_threads >= device_param->kernel_threads_min) && (native_threads <= device_param->kernel_threads_max)) + { + device_param->kernel_threads_min = native_threads; + device_param->kernel_threads_max = native_threads; + } + } + /** * create context for each device */ @@ -11423,7 +11388,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->device_name, device_param->opencl_device_version, device_param->opencl_driver_version, - (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX)); + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); md5_ctx_t md5_ctx; @@ -11758,7 +11723,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->vector_width, hashconfig->kern_type, extra_value, - (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX), + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max, build_options_module_buf); md5_ctx_t md5_ctx; @@ -14483,7 +14448,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) * now everything that depends on threads and accel, basically dynamic workload */ - u32 kernel_threads = get_kernel_threads (device_param); + // u32 kernel_threads = get_kernel_threads (device_param); if (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) { @@ -14491,12 +14456,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // in autotune. in this attack mode kernel_power is limited by salts_cnt so we // do not have a lot of options left. - kernel_threads = MIN (kernel_threads, 64); + device_param->kernel_threads_min = MIN (device_param->kernel_threads_min, 64); + device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64); } - device_param->kernel_threads = kernel_threads; + // device_param->kernel_threads = kernel_threads; + device_param->kernel_threads = 0; - device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * kernel_threads; + device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * device_param->kernel_threads_max; u32 kernel_accel_min = device_param->kernel_accel_min; u32 kernel_accel_max = device_param->kernel_accel_max; @@ -14622,6 +14589,47 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; + // work around, for some reason apple opencl can't have buffers larger 2^31 + // typically runs into trap 6 + // maybe 32/64 bit problem affecting size_t? + + if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) + { + const size_t undocumented_single_allocation_apple = 0x7fffffff; + + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_bfs > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_combs > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_digests > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_esalts > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_hooks > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_markov_css > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_plains > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws_amp > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws_comp > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws_idx > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_results > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_root_css > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_rules > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_rules_c > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_salts > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_extra_buffer > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_shown > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_tm > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_tmps > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_st_digests > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_st_salts > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_st_esalts > undocumented_single_allocation_apple) memory_limit_hit = 1; + } + const u64 size_total = bitmap_ctx->bitmap_size + bitmap_ctx->bitmap_size diff --git a/src/modules/module_01500.c b/src/modules/module_01500.c index dc7b7b47e..ca0e90232 100644 --- a/src/modules/module_01500.c +++ b/src/modules/module_01500.c @@ -95,13 +95,6 @@ int module_build_plain_postprocess (MAYBE_UNUSED const hashconfig_t *hashconfig, return src_len; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 64; // performance only optimization - - return kernel_threads_max; -} - u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { u32 kernel_loops_max = KERNEL_LOOPS_MAX; @@ -345,7 +338,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_03000.c b/src/modules/module_03000.c index c9b616ab5..cc091a5e5 100644 --- a/src/modules/module_03000.c +++ b/src/modules/module_03000.c @@ -96,13 +96,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY return jit_build_options; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 64; // performance only optimization - - return kernel_threads_max; -} - u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { u32 kernel_loops_max = KERNEL_LOOPS_MAX; @@ -269,7 +262,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_12500.c b/src/modules/module_12500.c index fda848e0a..a095b8f78 100644 --- a/src/modules/module_12500.c +++ b/src/modules/module_12500.c @@ -88,15 +88,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // -T 128 works slightly faster but it's free for the user to change - - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128; - - return kernel_threads_max; -} - u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL); @@ -256,7 +247,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_14000.c b/src/modules/module_14000.c index 013888bcc..abe8d259e 100644 --- a/src/modules/module_14000.c +++ b/src/modules/module_14000.c @@ -91,13 +91,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY return jit_build_options; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 64; // performance only optimization - - return kernel_threads_max; -} - u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { u32 kernel_loops_max = KERNEL_LOOPS_MAX; @@ -300,7 +293,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_14600.c b/src/modules/module_14600.c index 0ff26ac67..755fef2ff 100644 --- a/src/modules/module_14600.c +++ b/src/modules/module_14600.c @@ -179,17 +179,6 @@ typedef struct luks_tmp } luks_tmp_t; -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // the module requires a lot of registers for key schedulers on _comp kernel. - // it's possible, if using too many threads, there's not enough registers available, typically ending with misleading error message: - // cuLaunchKernel(): out of memory - - const u32 kernel_threads_max = 64; - - return kernel_threads_max; -} - void *module_benchmark_esalt (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { luks_t *luks = (luks_t *) hcmalloc (sizeof (luks_t)); @@ -655,7 +644,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = MODULE_DEFAULT; module_ctx->module_kernel_loops_min = MODULE_DEFAULT; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = module_kern_type_dynamic; diff --git a/src/modules/module_23700.c b/src/modules/module_23700.c index d3803b0eb..26a24c16f 100644 --- a/src/modules/module_23700.c +++ b/src/modules/module_23700.c @@ -105,15 +105,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // -T 128 works slightly faster but it's free for the user to change - - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128; - - return kernel_threads_max; -} - u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL); @@ -377,7 +368,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_23800.c b/src/modules/module_23800.c index f910b361f..1ca597082 100644 --- a/src/modules/module_23800.c +++ b/src/modules/module_23800.c @@ -388,15 +388,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // -T 128 works slightly faster but it's free for the user to change - - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128; - - return kernel_threads_max; -} - u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL); @@ -654,7 +645,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/selftest.c b/src/selftest.c index 4f8dc1092..2e0b5b000 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -453,6 +453,10 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // main : run the kernel + const u32 kernel_threads_sav = device_param->kernel_threads; + + device_param->kernel_threads = device_param->kernel_threads_min; + const double spin_damp_sav = device_param->spin_damp; device_param->spin_damp = 0; @@ -677,6 +681,8 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param device_param->spin_damp = spin_damp_sav; + device_param->kernel_threads = kernel_threads_sav; + // check : check if cracked u32 num_cracked = 0; diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl index ba2db8856..a481a87e9 100755 --- a/tools/benchmark_deep.pl +++ b/tools/benchmark_deep.pl @@ -409,7 +409,6 @@ for my $hash_type (@hash_types) "--wordlist-autohex-disable", "--potfile-disable", "--logfile-disable", - "--hwmon-disable", "--status", "--status-timer", 1, "--runtime", $runtime, From f4e52ca5330c9db3cd40f64460e3002300bf18ec Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 1 Aug 2021 00:04:10 +0200 Subject: [PATCH 2/3] Add new rule function '3' to switch the case of the first letter after occurrence N of char X --- OpenCL/inc_rp.cl | 30 +++++++++++++++ OpenCL/inc_rp.h | 2 + OpenCL/inc_rp_optimized.cl | 77 ++++++++++++++++++++++++++++++++++++++ OpenCL/inc_rp_optimized.h | 2 + docs/rules.txt | 1 + include/types.h | 1 + src/rp.c | 21 +++++++++-- src/rp_cpu.c | 42 +++++++++++++++++++++ 8 files changed, 172 insertions(+), 4 deletions(-) diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index 8cecc661d..8ad61b27f 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -269,6 +269,35 @@ DECLSPEC int mangle_toggle_at (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p return (len); } +DECLSPEC int mangle_toggle_at_sep (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len) +{ + if (len >= RP_PASSWORD_SIZE) return (len); + + u8 occurence = 0; + + u32 rem = 0; + + for (int i = 0, idx = 0; i < len; i += 4, idx += 1) + { + const u32 t = buf[idx]; + + buf[idx] = t | generate_cmask (t); + + u32 out = rem; + + rem = 0; + + if (((t >> 0) & 0xff) == p1) { if (occurence == p0) out = 0x0000ff00; occurence++; } + if (((t >> 8) & 0xff) == p1) { if (occurence == p0) out = 0x00ff0000; occurence++; } + if (((t >> 16) & 0xff) == p1) { if (occurence == p0) out = 0xff000000; occurence++; } + if (((t >> 24) & 0xff) == p1) { if (occurence == p0) rem = 0x000000ff; occurence++; } + + buf[idx] = t ^ (generate_cmask (t) & out); + } + + return (len); +} + DECLSPEC int mangle_reverse (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len) { for (int l = 0; l < len / 2; l++) @@ -725,6 +754,7 @@ DECLSPEC int apply_rule (const u32 name, MAYBE_UNUSED const u8 p0, MAYBE_UNUSED case RULE_OP_MANGLE_UREST_LFIRST: out_len = mangle_urest_lfirst (p0, p1, buf, out_len); break; case RULE_OP_MANGLE_TREST: out_len = mangle_trest (p0, p1, buf, out_len); break; case RULE_OP_MANGLE_TOGGLE_AT: out_len = mangle_toggle_at (p0, p1, buf, out_len); break; + case RULE_OP_MANGLE_TOGGLE_AT_SEP: out_len = mangle_toggle_at_sep (p0, p1, buf, out_len); break; case RULE_OP_MANGLE_REVERSE: out_len = mangle_reverse (p0, p1, buf, out_len); break; case RULE_OP_MANGLE_DUPEWORD: out_len = mangle_dupeword (p0, p1, buf, out_len); break; case RULE_OP_MANGLE_DUPEWORD_TIMES: out_len = mangle_dupeword_times (p0, p1, (u8 *) buf, out_len); break; diff --git a/OpenCL/inc_rp.h b/OpenCL/inc_rp.h index c13d06e1f..3b91ff9da 100644 --- a/OpenCL/inc_rp.h +++ b/OpenCL/inc_rp.h @@ -21,6 +21,7 @@ #define RULE_OP_MANGLE_UREST_LFIRST 'C' #define RULE_OP_MANGLE_TREST 't' #define RULE_OP_MANGLE_TOGGLE_AT 'T' +#define RULE_OP_MANGLE_TOGGLE_AT_SEP '3' #define RULE_OP_MANGLE_REVERSE 'r' #define RULE_OP_MANGLE_DUPEWORD 'd' #define RULE_OP_MANGLE_DUPEWORD_TIMES 'p' @@ -83,6 +84,7 @@ DECLSPEC int mangle_urest (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u DECLSPEC int mangle_urest_lfirst (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len); DECLSPEC int mangle_trest (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len); DECLSPEC int mangle_toggle_at (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len); +DECLSPEC int mangle_toggle_at_sep (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len); DECLSPEC int mangle_reverse (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len); DECLSPEC int mangle_dupeword (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u32 *buf, const int len); DECLSPEC int mangle_dupeword_times (MAYBE_UNUSED const u8 p0, MAYBE_UNUSED const u8 p1, u8 *buf, const int len); diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl index 026198f09..38239c0ec 100644 --- a/OpenCL/inc_rp_optimized.cl +++ b/OpenCL/inc_rp_optimized.cl @@ -1163,6 +1163,82 @@ DECLSPEC u32 rule_op_mangle_toggle_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED c return (in_len); } +DECLSPEC u32 rule_op_mangle_toggle_at_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len) +{ + if (in_len == 0) return in_len; + + u32 r0 = search_on_register (buf0[0], p1); + u32 r1 = search_on_register (buf0[1], p1); + u32 r2 = search_on_register (buf0[2], p1); + u32 r3 = search_on_register (buf0[3], p1); + u32 r4 = search_on_register (buf1[0], p1); + u32 r5 = search_on_register (buf1[1], p1); + u32 r6 = search_on_register (buf1[2], p1); + u32 r7 = search_on_register (buf1[3], p1); + + const u32 rn = (r0 << 0) + | (r1 << 4) + | (r2 << 8) + | (r3 << 12) + | (r4 << 16) + | (r5 << 20) + | (r6 << 24) + | (r7 << 28); + + if (rn == 0) return in_len; + + u32 occurence = 0; + + u32 ro = 0; + + #ifdef _unroll + #pragma unroll + #endif + for (int i = 0; i < 32; i++) + { + if ((rn >> i) & 1) + { + if (occurence == p0) + { + ro = 1 << i; + + break; + } + + occurence++; + } + } + + r0 = (ro >> 0) & 15; + r1 = (ro >> 4) & 15; + r2 = (ro >> 8) & 15; + r3 = (ro >> 12) & 15; + r4 = (ro >> 16) & 15; + r5 = (ro >> 20) & 15; + r6 = (ro >> 24) & 15; + r7 = (ro >> 28) & 15; + + r0 <<= 1; + r1 <<= 1; r1 |= r0 >> 4; + r2 <<= 1; r2 |= r1 >> 4; + r3 <<= 1; r3 |= r2 >> 4; + r4 <<= 1; r4 |= r3 >> 4; + r5 <<= 1; r5 |= r4 >> 4; + r6 <<= 1; r6 |= r5 >> 4; + r7 <<= 1; r7 |= r6 >> 4; + + buf0[0] = toggle_on_register (buf0[0], r0); + buf0[1] = toggle_on_register (buf0[1], r1); + buf0[2] = toggle_on_register (buf0[2], r2); + buf0[3] = toggle_on_register (buf0[3], r3); + buf1[0] = toggle_on_register (buf1[0], r4); + buf1[1] = toggle_on_register (buf1[1], r5); + buf1[2] = toggle_on_register (buf1[2], r6); + buf1[3] = toggle_on_register (buf1[3], r7); + + return in_len; +} + DECLSPEC u32 rule_op_mangle_reverse (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len) { reverse_block_optimized (buf0, buf1, buf0, buf1, in_len); @@ -2285,6 +2361,7 @@ DECLSPEC u32 apply_rule_optimized (const u32 name, const u32 p0, const u32 p1, u case RULE_OP_MANGLE_UREST_LFIRST: out_len = rule_op_mangle_urest_lfirst (p0, p1, buf0, buf1, out_len); break; case RULE_OP_MANGLE_TREST: out_len = rule_op_mangle_trest (p0, p1, buf0, buf1, out_len); break; case RULE_OP_MANGLE_TOGGLE_AT: out_len = rule_op_mangle_toggle_at (p0, p1, buf0, buf1, out_len); break; + case RULE_OP_MANGLE_TOGGLE_AT_SEP: out_len = rule_op_mangle_toggle_at_sep (p0, p1, buf0, buf1, out_len); break; case RULE_OP_MANGLE_REVERSE: out_len = rule_op_mangle_reverse (p0, p1, buf0, buf1, out_len); break; case RULE_OP_MANGLE_DUPEWORD: out_len = rule_op_mangle_dupeword (p0, p1, buf0, buf1, out_len); break; case RULE_OP_MANGLE_DUPEWORD_TIMES: out_len = rule_op_mangle_dupeword_times (p0, p1, buf0, buf1, out_len); break; diff --git a/OpenCL/inc_rp_optimized.h b/OpenCL/inc_rp_optimized.h index b6a133086..d391d1cfe 100644 --- a/OpenCL/inc_rp_optimized.h +++ b/OpenCL/inc_rp_optimized.h @@ -21,6 +21,7 @@ #define RULE_OP_MANGLE_UREST_LFIRST 'C' #define RULE_OP_MANGLE_TREST 't' #define RULE_OP_MANGLE_TOGGLE_AT 'T' +#define RULE_OP_MANGLE_TOGGLE_AT_SEP '3' #define RULE_OP_MANGLE_REVERSE 'r' #define RULE_OP_MANGLE_DUPEWORD 'd' #define RULE_OP_MANGLE_DUPEWORD_TIMES 'p' @@ -85,6 +86,7 @@ DECLSPEC u32 rule_op_mangle_lrest_ufirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSE DECLSPEC u32 rule_op_mangle_urest_lfirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len); DECLSPEC u32 rule_op_mangle_trest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len); DECLSPEC u32 rule_op_mangle_toggle_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len); +DECLSPEC u32 rule_op_mangle_toggle_at_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len); DECLSPEC u32 rule_op_mangle_reverse (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len); DECLSPEC u32 rule_op_mangle_dupeword (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len); DECLSPEC u32 rule_op_mangle_dupeword_times (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len); diff --git a/docs/rules.txt b/docs/rules.txt index b2621bba4..59233d1ad 100644 --- a/docs/rules.txt +++ b/docs/rules.txt @@ -5,6 +5,7 @@ #define RULE_OP_MANGLE_UREST_LFIRST 'C' // upper case all chars, lower case 1st #define RULE_OP_MANGLE_TREST 't' // switch the case of each char #define RULE_OP_MANGLE_TOGGLE_AT 'T' // switch the case of each char on pos N +#define RULE_OP_MANGLE_TOGGLE_AT_SEP '3' // switch the case of the first letter after occurrence N of char X #define RULE_OP_MANGLE_REVERSE 'r' // reverse word #define RULE_OP_MANGLE_DUPEWORD 'd' // append word to itself #define RULE_OP_MANGLE_DUPEWORD_TIMES 'p' // append word to itself N times diff --git a/include/types.h b/include/types.h index cb4f10f44..98766dfe5 100644 --- a/include/types.h +++ b/include/types.h @@ -295,6 +295,7 @@ typedef enum rule_functions RULE_OP_MANGLE_UREST_LFIRST = 'C', RULE_OP_MANGLE_TREST = 't', RULE_OP_MANGLE_TOGGLE_AT = 'T', + RULE_OP_MANGLE_TOGGLE_AT_SEP = '3', RULE_OP_MANGLE_REVERSE = 'r', RULE_OP_MANGLE_DUPEWORD = 'd', RULE_OP_MANGLE_DUPEWORD_TIMES = 'p', diff --git a/src/rp.c b/src/rp.c index 019b6333c..811546d37 100644 --- a/src/rp.c +++ b/src/rp.c @@ -71,7 +71,8 @@ static const char grp_op_chr_chr[] = static const char grp_op_pos_chr[] = { RULE_OP_MANGLE_INSERT, - RULE_OP_MANGLE_OVERSTRIKE + RULE_OP_MANGLE_OVERSTRIKE, + RULE_OP_MANGLE_TOGGLE_AT_SEP }; static const char grp_op_pos_pos0[] = @@ -444,12 +445,18 @@ int cpu_rule_to_kernel_rule (char *rule_buf, u32 rule_len, kernel_rule_t *rule) break; case RULE_OP_MANGLE_TITLE: - SET_NAME (rule, rule_buf[rule_pos]); + SET_NAME (rule, rule_buf[rule_pos]); break; case RULE_OP_MANGLE_TITLE_SEP: - SET_NAME (rule, rule_buf[rule_pos]); - SET_P0 (rule, rule_buf[rule_pos]); + SET_NAME (rule, rule_buf[rule_pos]); + SET_P0 (rule, rule_buf[rule_pos]); + break; + + case RULE_OP_MANGLE_TOGGLE_AT_SEP: + SET_NAME (rule, rule_buf[rule_pos]); + SET_P0_CONV (rule, rule_buf[rule_pos]); + SET_P1 (rule, rule_buf[rule_pos]); break; default: @@ -675,6 +682,12 @@ int kernel_rule_to_cpu_rule (char *rule_buf, kernel_rule_t *rule) GET_P0 (rule); break; + case RULE_OP_MANGLE_TOGGLE_AT_SEP: + rule_buf[rule_pos] = rule_cmd; + GET_P0_CONV (rule); + GET_P1 (rule); + break; + case 0: if (rule_pos == 0) return -1; return rule_pos - 1; diff --git a/src/rp_cpu.c b/src/rp_cpu.c index b9dc23e69..0fd6265af 100644 --- a/src/rp_cpu.c +++ b/src/rp_cpu.c @@ -45,6 +45,41 @@ static void MANGLE_SWITCH (char *arr, const int l, const int r) arr[l] = c; } +static int mangle_toggle_at_sep (char arr[RP_PASSWORD_SIZE], int arr_len, char c, int upos) +{ + int toggle_next = 0; + + int occurrence = 0; + + int pos; + + for (pos = 0; pos < arr_len; pos++) + { + if (arr[pos] == c) + { + if (occurrence == upos) + { + toggle_next = 1; + } + else + { + occurrence++; + } + + continue; + } + + if (toggle_next == 1) + { + MANGLE_TOGGLE_AT (arr, pos); + + break; + } + } + + return (arr_len); +} + static int mangle_lrest (char arr[RP_PASSWORD_SIZE], int arr_len) { int pos; @@ -561,6 +596,13 @@ int _old_apply_rule (const char *rule, int rule_len, char in[RP_PASSWORD_SIZE], if (upos < out_len) MANGLE_TOGGLE_AT (out, upos); break; + case RULE_OP_MANGLE_TOGGLE_AT_SEP: + NEXT_RULEPOS (rule_pos); + NEXT_RPTOI (rule_new, rule_pos, upos); + NEXT_RULEPOS (rule_pos); + out_len = mangle_toggle_at_sep (out, out_len, rule_new[rule_pos], upos); + break; + case RULE_OP_MANGLE_REVERSE: out_len = mangle_reverse (out, out_len); break; From 6bcbc218d6aec3644c6d893a402639ade48cbfff Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 1 Aug 2021 10:21:21 +0200 Subject: [PATCH 3/3] Fixed out-of-boundary read in input_tokenizer() if the signatures in the hash line is longer than the constant signature in the plugin --- docs/changes.txt | 1 + src/shared.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/changes.txt b/docs/changes.txt index 1ab36e2df..901cc3933 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -14,6 +14,7 @@ - Fixed autotune unitialized tmps variable for slow hashes by calling _init kernel before calling _loop kernel - Fixed datatype in function sha384_hmac_init_vector_128() that could come into effect if vector datatype was manually set - Fixed false negative in all VeraCrypt hash-modes if both conditions are met: 1. use CPU for cracking and 2. PIM range was used +- Fixed out-of-boundary read in input_tokenizer() if the signature in the hash is longer than the length of the plugins' signature constant ## ## Improvements diff --git a/src/shared.c b/src/shared.c index 7efdd4d53..9bff0c646 100644 --- a/src/shared.c +++ b/src/shared.c @@ -1181,7 +1181,7 @@ int input_tokenizer (const u8 *input_buf, const int input_len, token_t *token) for (int signature_idx = 0; signature_idx < token->signatures_cnt; signature_idx++) { - if (memcmp (token->buf[token_idx], token->signatures_buf[signature_idx], token->len[token_idx]) == 0) matched = true; + if (strncmp ((char *) token->buf[token_idx], token->signatures_buf[signature_idx], token->len[token_idx]) == 0) matched = true; } if (matched == false) return (PARSER_SIGNATURE_UNMATCHED);