diff --git a/src/autotune.c b/src/autotune.c index cdeb73553..d6c406513 100644 --- a/src/autotune.c +++ b/src/autotune.c @@ -10,7 +10,7 @@ #include "status.h" #include "autotune.h" -static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops) +static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads) { hashconfig_t *hashconfig = hashcat_ctx->hashconfig; user_options_t *user_options = hashcat_ctx->user_options; @@ -19,7 +19,9 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes - u32 kernel_power_try = device_param->hardware_power * kernel_accel; + const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * kernel_threads; + + u32 kernel_power_try = hardware_power * kernel_accel; if (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) { @@ -33,6 +35,10 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par } } + const u32 kernel_threads_sav = device_param->kernel_threads; + + device_param->kernel_threads = kernel_threads; + const double spin_damp_sav = device_param->spin_damp; device_param->spin_damp = 0; @@ -50,71 +56,51 @@ static double try_run (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_par } else { - run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_try, true, 0); - - if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE) - { - run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_try, true, 0); - } - run_kernel (hashcat_ctx, device_param, KERN_RUN_2, 0, kernel_power_try, true, 0); } device_param->spin_damp = spin_damp_sav; + device_param->kernel_threads = kernel_threads_sav; + const double exec_msec_prev = get_avg_exec_time (device_param, 1); return exec_msec_prev; } -/* -static double try_run_preferred (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops) +static double try_run_times (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const u32 kernel_accel, const u32 kernel_loops, const u32 kernel_threads, const int times) { - hashconfig_t *hashconfig = hashcat_ctx->hashconfig; - - device_param->kernel_params_buf32[28] = 0; - device_param->kernel_params_buf32[29] = kernel_loops; // not a bug, both need to be set - device_param->kernel_params_buf32[30] = kernel_loops; // because there's two variables for inner iters for slow and fast hashes + double exec_msec_best = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); - const u32 kernel_power_try = device_param->hardware_power * kernel_accel; - - const u32 kernel_threads_sav = device_param->kernel_threads; - - const double spin_damp_sav = device_param->spin_damp; - - device_param->spin_damp = 0; - - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + for (int i = 1; i < times; i++) { - if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) - { - device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple1; + double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); - run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_try, true, 0); - } - else - { - device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple4; + if (exec_msec > exec_msec_best) continue; - run_kernel (hashcat_ctx, device_param, KERN_RUN_4, 0, kernel_power_try, true, 0); - } + exec_msec_best = exec_msec; } - else - { - device_param->kernel_threads = device_param->kernel_preferred_wgs_multiple2; - run_kernel (hashcat_ctx, device_param, KERN_RUN_2, 0, kernel_power_try, true, 0); - } + return exec_msec_best; +} - device_param->kernel_threads = kernel_threads_sav; +static u32 previous_power_of_two (const u32 x) +{ + // https://stackoverflow.com/questions/2679815/previous-power-of-2 + // really cool! - device_param->spin_damp = spin_damp_sav; + if (x == 0) return 0; - const double exec_msec_prev = get_avg_exec_time (device_param, 1); + u32 r = x; - return exec_msec_prev; + r |= (r >> 1); + r |= (r >> 2); + r |= (r >> 4); + r |= (r >> 8); + r |= (r >> 16); + + return r - (r >> 1); } -*/ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param) { @@ -131,9 +117,57 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param const u32 kernel_loops_min = device_param->kernel_loops_min; const u32 kernel_loops_max = device_param->kernel_loops_max; + const u32 kernel_threads_min = device_param->kernel_threads_min; + const u32 kernel_threads_max = device_param->kernel_threads_max; + u32 kernel_accel = kernel_accel_min; u32 kernel_loops = kernel_loops_min; + // for the threads we take as initial value what we receive from the runtime + // but is only to start with something, we will fine tune this value as soon as we have our workload specified + // this thread limiting is also performed insinde run_kernel() so we need to redo it here, too + + u32 kernel_wgs = 0; + u32 kernel_wgs_multiple = 0; + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) + { + kernel_wgs = device_param->kernel_wgs1; + + kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple1; + } + else + { + kernel_wgs = device_param->kernel_wgs4; + + kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple4; + } + } + else + { + kernel_wgs = device_param->kernel_wgs2; + + kernel_wgs_multiple = device_param->kernel_preferred_wgs_multiple2; + } + + u32 kernel_threads = kernel_threads_max; + + if ((kernel_wgs >= kernel_threads_min) && (kernel_wgs <= kernel_threads_max)) + { + kernel_threads = kernel_wgs; + } + + // having a value power of 2 makes it easier to divide + + const u32 kernel_threads_p2 = previous_power_of_two (kernel_threads); + + if ((kernel_threads_p2 >= kernel_threads_min) && (kernel_threads_p2 <= kernel_threads_max)) + { + kernel_threads = kernel_threads_p2; + } + // in this case the user specified a fixed -n and -u on the commandline // no way to tune anything // but we need to run a few caching rounds @@ -149,10 +183,10 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (hashconfig->warmup_disable == false) { - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); - try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); + try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads); } #endif @@ -222,13 +256,37 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } } + // we also need to initialize some values using kernels + + if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) + { + // nothing to do + } + else + { + const u32 kernel_threads_sav = device_param->kernel_threads; + + device_param->kernel_threads = device_param->kernel_wgs1; + + run_kernel (hashcat_ctx, device_param, KERN_RUN_1, 0, kernel_power_max, false, 0); + + if (hashconfig->opts_type & OPTS_TYPE_LOOP_PREPARE) + { + device_param->kernel_threads = device_param->kernel_wgs2p; + + run_kernel (hashcat_ctx, device_param, KERN_RUN_2P, 0, kernel_power_max, false, 0); + } + + device_param->kernel_threads = kernel_threads_sav; + } + // Do a pre-autotune test run to find out if kernel runtime is above some TDR limit u32 kernel_loops_max_reduced = kernel_loops_max; if (true) { - double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min); + double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads); if (exec_msec > 2000) { @@ -237,7 +295,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param return -1; } - exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min); + exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops_min, kernel_threads); const u32 mm = kernel_loops_max / kernel_loops_min; @@ -257,16 +315,16 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param { if (kernel_loops > kernel_loops_max_reduced) continue; - double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_min, kernel_loops); + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_min, kernel_loops, kernel_threads, 1); if (exec_msec < target_msec) break; } } - // now the same for kernel-accel but with the new kernel-loops from previous loop set - #define STEPS_CNT 16 + // now the same for kernel-accel but with the new kernel-loops from previous loop set + if (kernel_accel_min < kernel_accel_max) { for (int i = 0; i < STEPS_CNT; i++) @@ -276,7 +334,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param if (kernel_accel_try < kernel_accel_min) continue; if (kernel_accel_try > kernel_accel_max) break; - double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops); + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads, 1); if (exec_msec > target_msec) break; @@ -292,7 +350,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param const u32 kernel_accel_orig = kernel_accel; const u32 kernel_loops_orig = kernel_loops; - double exec_msec_prev = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1); for (int i = 1; i < STEPS_CNT; i++) { @@ -307,7 +365,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // do a real test - const double exec_msec = try_run (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try); + const double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops_try, kernel_threads, 1); if (exec_msec_prev < exec_msec) break; @@ -324,7 +382,7 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param } } - double exec_msec_pre_final = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + double exec_msec_pre_final = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 1); const u32 exec_left = (const u32) (target_msec / exec_msec_pre_final); @@ -339,46 +397,43 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param kernel_accel *= exec_accel_min; } - // start finding best thread count is easier. - // it's either the preferred or the maximum thread count + // v6.2.4 new section: find thread count + // This is not as effective as it could be because of inaccurate kernel return timers + // But is better than fixed values + // Timers in this section are critical, so we rerun meassurements 3 times - /* - const u32 kernel_threads_min = device_param->kernel_threads_min; - const u32 kernel_threads_max = device_param->kernel_threads_max; - - if (kernel_threads_min < kernel_threads_max) + if (kernel_threads_max > kernel_threads_min) { - const double exec_msec_max = try_run (hashcat_ctx, device_param, kernel_accel, kernel_loops); + const u32 kernel_accel_orig = kernel_accel; + const u32 kernel_threads_orig = kernel_threads; - u32 preferred_threads = 0; + double exec_msec_prev = try_run_times (hashcat_ctx, device_param, kernel_accel, kernel_loops, kernel_threads, 3); - if (hashconfig->attack_exec == ATTACK_EXEC_INSIDE_KERNEL) - { - if (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL) - { - preferred_threads = device_param->kernel_preferred_wgs_multiple1; - } - else - { - preferred_threads = device_param->kernel_preferred_wgs_multiple4; - } - } - else + for (int i = 1; i < STEPS_CNT; i++) { - preferred_threads = device_param->kernel_preferred_wgs_multiple2; - } + const u32 kernel_accel_try = kernel_accel_orig * (1U << i); + const u32 kernel_threads_try = kernel_threads_orig / (1U << i); - if ((preferred_threads >= kernel_threads_min) && (preferred_threads <= kernel_threads_max)) - { - const double exec_msec_preferred = try_run_preferred (hashcat_ctx, device_param, kernel_accel, kernel_loops); + // since we do not modify total amount of workitems, we can (and need) to do increase kernel_accel_max - if (exec_msec_preferred < exec_msec_max) - { - device_param->kernel_threads = preferred_threads; - } + const u32 kernel_accel_max_try = kernel_accel_max * (1U << i); + + if (kernel_accel_try > kernel_accel_max_try) break; + + if (kernel_threads_try < kernel_threads_min) break; + + if (kernel_threads_try % kernel_wgs_multiple) break; // this would just be waste of time + + double exec_msec = try_run_times (hashcat_ctx, device_param, kernel_accel_try, kernel_loops, kernel_threads_try, 3); + + if (exec_msec > exec_msec_prev) continue; + + exec_msec_prev = exec_msec; + + kernel_accel = kernel_accel_try; + kernel_threads = kernel_threads_try; } } - */ } // reset them fake words @@ -478,8 +533,13 @@ static int autotune (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // store - device_param->kernel_accel = kernel_accel; - device_param->kernel_loops = kernel_loops; + device_param->kernel_accel = kernel_accel; + device_param->kernel_loops = kernel_loops; + device_param->kernel_threads = kernel_threads; + + const u32 hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_param->device_processors) * device_param->kernel_threads; + + device_param->hardware_power = hardware_power; const u32 kernel_power = device_param->hardware_power * device_param->kernel_accel; diff --git a/src/backend.c b/src/backend.c index 05cdeeac4..e95793c4f 100644 --- a/src/backend.c +++ b/src/backend.c @@ -5420,6 +5420,8 @@ int run_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, con if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event2) == -1) return -1; + if (hc_cuEventSynchronize (hashcat_ctx, device_param->cuda_event1) == -1) return -1; + float exec_ms; if (hc_cuEventElapsedTime (hashcat_ctx, &exec_ms, device_param->cuda_event1, device_param->cuda_event2) == -1) return -1; @@ -9095,6 +9097,13 @@ int backend_ctx_devices_init (hashcat_ctx_t *hashcat_ctx, const int comptime) } } + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + { + // they like this + + device_param->kernel_preferred_wgs_multiple = 1; + } + if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) { if ((device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) && (device_param->opencl_device_vendor_id == VENDOR_ID_AMD)) @@ -10113,61 +10122,6 @@ static int get_opencl_kernel_dynamic_local_mem_size (hashcat_ctx_t *hashcat_ctx, return 0; } -static u32 get_kernel_threads (const hc_device_param_t *device_param) -{ - // this is an upper limit, a good start, since our strategy is to reduce thread counts only. - - u32 kernel_threads_min = device_param->kernel_threads_min; - u32 kernel_threads_max = device_param->kernel_threads_max; - - // the changes we do here are just optimizations, since the module always has priority. - - const u32 device_maxworkgroup_size = (const u32) device_param->device_maxworkgroup_size; - - kernel_threads_max = MIN (kernel_threads_max, device_maxworkgroup_size); - - if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) - { - // for all CPU we just do 1 ... - - kernel_threads_max = MIN (kernel_threads_max, 1); - } - else if (device_param->opencl_device_type & CL_DEVICE_TYPE_GPU) - { - // for GPU we need to distinguish by vendor - - if (device_param->opencl_device_vendor_id == VENDOR_ID_INTEL_SDK) - { - kernel_threads_max = MIN (kernel_threads_max, 8); - } - else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD) - { - if (device_param->kernel_preferred_wgs_multiple == 64) - { - // only older AMD GPUs with WaveFront size 64 benefit from this - - kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); - } - } - else if (device_param->opencl_device_vendor_id == VENDOR_ID_AMD_USE_HIP) - { - if (device_param->kernel_preferred_wgs_multiple == 64) - { - // only older AMD GPUs with WaveFront size 64 benefit from this - - kernel_threads_max = MIN (kernel_threads_max, device_param->kernel_preferred_wgs_multiple); - } - } - } - - // this is intenionally! at this point, kernel_threads_min can be higher than kernel_threads_max. - // in this case we actually want kernel_threads_min selected. - - const u32 kernel_threads = MAX (kernel_threads_min, kernel_threads_max); - - return kernel_threads; -} - static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param, const char *kernel_name, char *source_file, char *cached_file, const char *build_options_buf, const bool cache_disable, cl_program *opencl_program, CUmodule *cuda_module, hipModule_t *hip_module) { const hashconfig_t *hashconfig = hashcat_ctx->hashconfig; @@ -10206,8 +10160,6 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p if (cached == false) { #if defined (DEBUG) - const user_options_t *user_options = hashcat_ctx->user_options; - if (user_options->quiet == false) event_log_warning (hashcat_ctx, "* Device #%u: Kernel %s not found in cache. Please be patient...", device_param->device_id + 1, filename_from_filepath (cached_file)); #endif @@ -10460,7 +10412,7 @@ static bool load_kernel (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_p //hiprtc_options[1] = "--device-as-default-execution-space"; //hiprtc_options[2] = "--gpu-architecture"; - hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX)); + hc_asprintf (&hiprtc_options[0], "--gpu-max-threads-per-block=%d", (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); hiprtc_options[1] = "-nocudainc"; hiprtc_options[2] = "-nocudalib"; @@ -11106,6 +11058,19 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) } } + // this seems to work always + + if (device_param->opencl_device_type & CL_DEVICE_TYPE_CPU) + { + u32 native_threads = 1; + + if ((native_threads >= device_param->kernel_threads_min) && (native_threads <= device_param->kernel_threads_max)) + { + device_param->kernel_threads_min = native_threads; + device_param->kernel_threads_max = native_threads; + } + } + /** * create context for each device */ @@ -11423,7 +11388,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->device_name, device_param->opencl_device_version, device_param->opencl_driver_version, - (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX)); + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max); md5_ctx_t md5_ctx; @@ -11758,7 +11723,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) device_param->vector_width, hashconfig->kern_type, extra_value, - (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : ((device_param->kernel_preferred_wgs_multiple == 64) ? 64 : KERNEL_THREADS_MAX), + (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : device_param->kernel_threads_max, build_options_module_buf); md5_ctx_t md5_ctx; @@ -14483,7 +14448,7 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) * now everything that depends on threads and accel, basically dynamic workload */ - u32 kernel_threads = get_kernel_threads (device_param); + // u32 kernel_threads = get_kernel_threads (device_param); if (user_options->attack_mode == ATTACK_MODE_ASSOCIATION) { @@ -14491,12 +14456,14 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) // in autotune. in this attack mode kernel_power is limited by salts_cnt so we // do not have a lot of options left. - kernel_threads = MIN (kernel_threads, 64); + device_param->kernel_threads_min = MIN (device_param->kernel_threads_min, 64); + device_param->kernel_threads_max = MIN (device_param->kernel_threads_max, 64); } - device_param->kernel_threads = kernel_threads; + // device_param->kernel_threads = kernel_threads; + device_param->kernel_threads = 0; - device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * kernel_threads; + device_param->hardware_power = ((hashconfig->opts_type & OPTS_TYPE_MP_MULTI_DISABLE) ? 1 : device_processors) * device_param->kernel_threads_max; u32 kernel_accel_min = device_param->kernel_accel_min; u32 kernel_accel_max = device_param->kernel_accel_max; @@ -14622,6 +14589,47 @@ int backend_session_begin (hashcat_ctx_t *hashcat_ctx) if ((size_tmps + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; if ((size_hooks + EXTRA_SPACE) > device_param->device_maxmem_alloc) memory_limit_hit = 1; + // work around, for some reason apple opencl can't have buffers larger 2^31 + // typically runs into trap 6 + // maybe 32/64 bit problem affecting size_t? + + if (device_param->opencl_platform_vendor_id == VENDOR_ID_APPLE) + { + const size_t undocumented_single_allocation_apple = 0x7fffffff; + + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (bitmap_ctx->bitmap_size > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_bfs > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_combs > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_digests > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_esalts > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_hooks > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_markov_css > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_plains > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws_amp > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws_comp > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_pws_idx > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_results > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_root_css > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_rules > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_rules_c > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_salts > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_extra_buffer > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_shown > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_tm > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_tmps > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_st_digests > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_st_salts > undocumented_single_allocation_apple) memory_limit_hit = 1; + if (size_st_esalts > undocumented_single_allocation_apple) memory_limit_hit = 1; + } + const u64 size_total = bitmap_ctx->bitmap_size + bitmap_ctx->bitmap_size diff --git a/src/modules/module_01500.c b/src/modules/module_01500.c index dc7b7b47e..ca0e90232 100644 --- a/src/modules/module_01500.c +++ b/src/modules/module_01500.c @@ -95,13 +95,6 @@ int module_build_plain_postprocess (MAYBE_UNUSED const hashconfig_t *hashconfig, return src_len; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 64; // performance only optimization - - return kernel_threads_max; -} - u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { u32 kernel_loops_max = KERNEL_LOOPS_MAX; @@ -345,7 +338,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_03000.c b/src/modules/module_03000.c index c9b616ab5..cc091a5e5 100644 --- a/src/modules/module_03000.c +++ b/src/modules/module_03000.c @@ -96,13 +96,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY return jit_build_options; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 64; // performance only optimization - - return kernel_threads_max; -} - u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { u32 kernel_loops_max = KERNEL_LOOPS_MAX; @@ -269,7 +262,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_12500.c b/src/modules/module_12500.c index fda848e0a..a095b8f78 100644 --- a/src/modules/module_12500.c +++ b/src/modules/module_12500.c @@ -88,15 +88,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // -T 128 works slightly faster but it's free for the user to change - - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128; - - return kernel_threads_max; -} - u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL); @@ -256,7 +247,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_14000.c b/src/modules/module_14000.c index 013888bcc..abe8d259e 100644 --- a/src/modules/module_14000.c +++ b/src/modules/module_14000.c @@ -91,13 +91,6 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY return jit_build_options; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 64; // performance only optimization - - return kernel_threads_max; -} - u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { u32 kernel_loops_max = KERNEL_LOOPS_MAX; @@ -300,7 +293,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_14600.c b/src/modules/module_14600.c index 0ff26ac67..755fef2ff 100644 --- a/src/modules/module_14600.c +++ b/src/modules/module_14600.c @@ -179,17 +179,6 @@ typedef struct luks_tmp } luks_tmp_t; -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // the module requires a lot of registers for key schedulers on _comp kernel. - // it's possible, if using too many threads, there's not enough registers available, typically ending with misleading error message: - // cuLaunchKernel(): out of memory - - const u32 kernel_threads_max = 64; - - return kernel_threads_max; -} - void *module_benchmark_esalt (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { luks_t *luks = (luks_t *) hcmalloc (sizeof (luks_t)); @@ -655,7 +644,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = MODULE_DEFAULT; module_ctx->module_kernel_loops_min = MODULE_DEFAULT; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = module_kern_type_dynamic; diff --git a/src/modules/module_23700.c b/src/modules/module_23700.c index d3803b0eb..26a24c16f 100644 --- a/src/modules/module_23700.c +++ b/src/modules/module_23700.c @@ -105,15 +105,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // -T 128 works slightly faster but it's free for the user to change - - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128; - - return kernel_threads_max; -} - u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL); @@ -377,7 +368,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/modules/module_23800.c b/src/modules/module_23800.c index f910b361f..1ca597082 100644 --- a/src/modules/module_23800.c +++ b/src/modules/module_23800.c @@ -388,15 +388,6 @@ u32 module_kernel_loops_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_ return kernel_loops_max; } -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - // -T 128 works slightly faster but it's free for the user to change - - const u32 kernel_threads_max = (user_options->kernel_threads_chgd == true) ? user_options->kernel_threads : 128; - - return kernel_threads_max; -} - u32 module_pw_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) { const bool optimized_kernel = (hashconfig->opti_type & OPTI_TYPE_OPTIMIZED_KERNEL); @@ -654,7 +645,7 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = module_kernel_loops_max; module_ctx->module_kernel_loops_min = module_kernel_loops_min; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; diff --git a/src/selftest.c b/src/selftest.c index 4f8dc1092..2e0b5b000 100644 --- a/src/selftest.c +++ b/src/selftest.c @@ -453,6 +453,10 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param // main : run the kernel + const u32 kernel_threads_sav = device_param->kernel_threads; + + device_param->kernel_threads = device_param->kernel_threads_min; + const double spin_damp_sav = device_param->spin_damp; device_param->spin_damp = 0; @@ -677,6 +681,8 @@ static int selftest (hashcat_ctx_t *hashcat_ctx, hc_device_param_t *device_param device_param->spin_damp = spin_damp_sav; + device_param->kernel_threads = kernel_threads_sav; + // check : check if cracked u32 num_cracked = 0; diff --git a/tools/benchmark_deep.pl b/tools/benchmark_deep.pl index ba2db8856..a481a87e9 100755 --- a/tools/benchmark_deep.pl +++ b/tools/benchmark_deep.pl @@ -409,7 +409,6 @@ for my $hash_type (@hash_types) "--wordlist-autohex-disable", "--potfile-disable", "--logfile-disable", - "--hwmon-disable", "--status", "--status-timer", 1, "--runtime", $runtime,