From 3e4badd9b4fc2300fe5c5ddf4b3335a5afffc4d7 Mon Sep 17 00:00:00 2001 From: jsteube Date: Sun, 29 May 2016 00:59:24 +0200 Subject: [PATCH] Send a notice to the user in case the driver's temperature threshold for slowdown is reached and a performance drop is expected due to throttling --- include/ext_nvml.h | 12 +++++++ include/shared.h | 11 ++++--- include/types.h | 7 ++-- src/ext_nvml.c | 18 +++++++++++ src/hashcat.c | 80 ++++++++++++++++++++++++++++++++++++++++------ src/shared.c | 29 ++++++++++++++++- 6 files changed, 139 insertions(+), 18 deletions(-) diff --git a/include/ext_nvml.h b/include/ext_nvml.h index 68fd3245e..03671d524 100644 --- a/include/ext_nvml.h +++ b/include/ext_nvml.h @@ -58,6 +58,15 @@ typedef enum nvmlClockType_enum { NVML_CLOCK_MEM = 2 } nvmlClockType_t; +typedef enum nvmlTemperatureThresholds_enum +{ + NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down + // for HW protection + NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin slowdown + // Keep this last + NVML_TEMPERATURE_THRESHOLD_COUNT +} nvmlTemperatureThresholds_t; + /* * End of declarations from nvml.h **/ @@ -76,6 +85,7 @@ typedef nvmlReturn_t (*NVML_DEVICE_GET_FAN_SPEED) (nvmlDevice_t, unsigned int *) typedef nvmlReturn_t (*NVML_DEVICE_GET_POWER_USAGE) (nvmlDevice_t, unsigned int *); typedef nvmlReturn_t (*NVML_DEVICE_GET_UTILIZATION_RATES) (nvmlDevice_t, nvmlUtilization_t *); typedef nvmlReturn_t (*NVML_DEVICE_GET_CLOCKINFO) (nvmlDevice_t, nvmlClockType_t, unsigned int *); +typedef nvmlReturn_t (*NVML_DEVICE_GET_THRESHOLD) (nvmlDevice_t, nvmlTemperatureThresholds_t, unsigned int *); typedef struct { @@ -91,6 +101,7 @@ typedef struct NVML_DEVICE_GET_POWER_USAGE nvmlDeviceGetPowerUsage; NVML_DEVICE_GET_UTILIZATION_RATES nvmlDeviceGetUtilizationRates; NVML_DEVICE_GET_CLOCKINFO nvmlDeviceGetClockInfo; + NVML_DEVICE_GET_THRESHOLD nvmlDeviceGetTemperatureThreshold; } hm_nvml_lib_t; @@ -109,6 +120,7 @@ nvmlReturn_t 
hm_NVML_nvmlDeviceGetFanSpeed (NVML_PTR *nvml, int, nvmlDevice_t de nvmlReturn_t hm_NVML_nvmlDeviceGetPowerUsage (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *power); nvmlReturn_t hm_NVML_nvmlDeviceGetUtilizationRates (NVML_PTR *nvml, nvmlDevice_t device, nvmlUtilization_t *utilization); nvmlReturn_t hm_NVML_nvmlDeviceGetClockInfo (NVML_PTR *nvml, nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); +nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); #endif // HAVE_HWMON && HAVE_NVML diff --git a/include/shared.h b/include/shared.h index b1ddb297f..272458753 100644 --- a/include/shared.h +++ b/include/shared.h @@ -1454,11 +1454,12 @@ int hm_check_fanspeed_control (void *adl, hm_attrs_t *hm_device, u32 *valid_adl_ // void hm_get_opencl_busid_devid (hm_attrs_t *hm_device, uint opencl_num_devices, cl_device_id *devices); #endif // HAVE_ADL -int hm_get_temperature_with_device_id (const uint device_id); -int hm_get_fanspeed_with_device_id (const uint device_id); -int hm_get_utilization_with_device_id (const uint device_id); -int hm_get_memoryspeed_with_device_id (const uint device_id); -int hm_get_corespeed_with_device_id (const uint device_id); +int hm_get_threshold_slowdown_with_device_id (const uint device_id); +int hm_get_temperature_with_device_id (const uint device_id); +int hm_get_fanspeed_with_device_id (const uint device_id); +int hm_get_utilization_with_device_id (const uint device_id); +int hm_get_memoryspeed_with_device_id (const uint device_id); +int hm_get_corespeed_with_device_id (const uint device_id); int hm_set_fanspeed_with_device_id_amd (const uint device_id, const int fanspeed); diff --git a/include/types.h b/include/types.h index 2ccfe0280..df55563c9 100644 --- a/include/types.h +++ b/include/types.h @@ -1098,8 +1098,11 @@ typedef struct } adapter_index; - int od_version; - int fan_supported; + int od_version; + int 
fan_supported; + + int gpu_temp_threshold_slowdown; + int gpu_temp_threshold_shutdown; // int busid; // used for CL_DEVICE_TOPOLOGY_AMD but broken for dual GPUs // int devid; // used for CL_DEVICE_TOPOLOGY_AMD but broken for dual GPUs diff --git a/src/ext_nvml.c b/src/ext_nvml.c index 2a662148e..31c078ab0 100644 --- a/src/ext_nvml.c +++ b/src/ext_nvml.c @@ -33,6 +33,7 @@ int nvml_init (NVML_PTR *nvml) HC_LOAD_FUNC(nvml, nvmlDeviceGetPowerUsage, NVML_DEVICE_GET_POWER_USAGE, NVML, 0) HC_LOAD_FUNC(nvml, nvmlDeviceGetUtilizationRates, NVML_DEVICE_GET_UTILIZATION_RATES, NVML, 0) HC_LOAD_FUNC(nvml, nvmlDeviceGetClockInfo, NVML_DEVICE_GET_CLOCKINFO, NVML, 0) + HC_LOAD_FUNC(nvml, nvmlDeviceGetTemperatureThreshold, NVML_DEVICE_GET_THRESHOLD, NVML, 0) return 0; } @@ -218,3 +219,20 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetClockInfo (NVML_PTR *nvml, nvmlDevice_t device return nvml_rc; } +nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp) +{ + if (!nvml) return -1; + + nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetTemperatureThreshold (device, thresholdType, temp); + + if (nvml_rc != NVML_SUCCESS) + { + *temp = -1; + + //const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc); + + //log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string); + } + + return nvml_rc; +} diff --git a/src/hashcat.c b/src/hashcat.c index fd09ac316..9a60017f8 100644 --- a/src/hashcat.c +++ b/src/hashcat.c @@ -2104,6 +2104,7 @@ static void check_hash (hc_device_param_t *device_param, plain_t *plain) out_fp = stdout; } + lock_file (out_fp); } else @@ -3781,6 +3782,8 @@ static void *thread_monitor (void *p) #ifdef HAVE_HWMON uint hwmon_check = 0; + int slowdown_warnings = 0; + // these variables are mainly used for fan control (AMD only) int *fan_speed_chgd = (int *) mycalloc (data.devices_cnt, sizeof (int)); @@ -3845,8 +3848,52 @@ static void *thread_monitor (void *p) if 
(data.devices_status != STATUS_RUNNING) continue; - #ifdef HAVE_HWMON + + if (1) + { + hc_thread_mutex_lock (mux_adl); + + for (uint device_id = 0; device_id < data.devices_cnt; device_id++) + { + hc_device_param_t *device_param = &data.devices_param[device_id]; + + if (device_param->skipped) continue; + + if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) continue; + + const int temperature = hm_get_temperature_with_device_id (device_id); + + const int threshold = data.hm_device[device_id].gpu_temp_threshold_slowdown; + + if (temperature >= threshold) + { + if (slowdown_warnings < 3) + { + if (data.quiet == 0) clear_prompt (); + + log_info ("WARNING: Drivers temperature threshold (%dc) hit on GPU #%d, expect performance to drop...", threshold, device_id + 1); + + if (slowdown_warnings == 2) + { + log_info (""); + } + + if (data.quiet == 0) fprintf (stdout, "%s", PROMPT); + if (data.quiet == 0) fflush (stdout); + + slowdown_warnings++; + } + } + else + { + slowdown_warnings = 0; + } + } + + hc_thread_mutex_unlock (mux_adl); + } + if (hwmon_check == 1) { hc_thread_mutex_lock (mux_adl); @@ -13946,11 +13993,11 @@ int main (int argc, char **argv) #ifdef HAVE_HWMON #if defined(HAVE_NVML) || defined(HAVE_NVAPI) - hm_attrs_t hm_adapters_nv[DEVICES_MAX] = { { { 0 }, 0, 0 } }; + hm_attrs_t hm_adapters_nv[DEVICES_MAX] = { { { 0 }, 0, 0, 0, 0 } }; #endif #ifdef HAVE_ADL - hm_attrs_t hm_adapters_amd[DEVICES_MAX] = { { { 0 }, 0, 0 } }; + hm_attrs_t hm_adapters_amd[DEVICES_MAX] = { { { 0 }, 0, 0, 0, 0 } }; #endif if (gpu_temp_disable == 0) @@ -14219,13 +14266,13 @@ int main (int argc, char **argv) } } - /* - * Temporary fix: - * with AMD r9 295x cards it seems that we need to set the powertune value just AFTER the ocl init stuff - * otherwise after hc_clCreateContext () etc, powertune value was set back to "normal" and cards unfortunately - * were not working @ full speed (setting hm_ADL_Overdrive_PowerControl_Set () here seems to fix the problem) - * 
Driver / ADL bug? - */ + /** + * Temporary fix: + * with AMD r9 295x cards it seems that we need to set the powertune value just AFTER the ocl init stuff + * otherwise after hc_clCreateContext () etc, powertune value was set back to "normal" and cards unfortunately + * were not working @ full speed (setting hm_ADL_Overdrive_PowerControl_Set () here seems to fix the problem) + * Driver / ADL bug? + */ #ifdef HAVE_ADL if (powertune_enable == 1) @@ -15565,6 +15612,19 @@ int main (int argc, char **argv) run_kernel_bzero (device_param, device_param->d_markov_css_buf, size_markov_css); } + /** + * Store thermal target temperature so we can send a notice to user + */ + + #if defined(HAVE_HWMON) + if (gpu_temp_disable == 0) + { + const int gpu_temp_threshold_slowdown = hm_get_threshold_slowdown_with_device_id (device_id); + + data.hm_device[device_id].gpu_temp_threshold_slowdown = (gpu_temp_threshold_slowdown == -1) ? 100000 : gpu_temp_threshold_slowdown; + } + #endif + /** * Store initial fanspeed if gpu_temp_retain is enabled */ diff --git a/src/shared.c b/src/shared.c index 6f17e8827..0396e9351 100644 --- a/src/shared.c +++ b/src/shared.c @@ -3057,6 +3057,34 @@ int hm_get_adapter_index_amd (hm_attrs_t *hm_device, u32 *valid_adl_device_list, } #endif // HAVE_ADL +int hm_get_threshold_slowdown_with_device_id (const uint device_id) +{ + if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1; + + #ifdef HAVE_ADL + + #endif + + #if defined(HAVE_NVML) || defined(HAVE_NVAPI) + if (data.devices_param[device_id].device_vendor_id == VENDOR_ID_NV) + { + #if defined(LINUX) && defined(HAVE_NVML) + int target = 0; + + hm_NVML_nvmlDeviceGetTemperatureThreshold (data.hm_nv, data.hm_device[device_id].adapter_index.nv, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, (unsigned int *) &target); + + return target; + #endif + + #if defined(WIN) && defined(HAVE_NVAPI) + + #endif // WIN && HAVE_NVAPI + } + #endif // HAVE_NVML || HAVE_NVAPI + + return -1; +} + int 
hm_get_temperature_with_device_id (const uint device_id) { if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1; @@ -3169,7 +3197,6 @@ int hm_get_fanspeed_with_device_id (const uint device_id) #endif #if defined(WIN) && defined(HAVE_NVAPI) - NV_GPU_COOLER_SETTINGS pCoolerSettings; pCoolerSettings.Version = GPU_COOLER_SETTINGS_VER | sizeof (NV_GPU_COOLER_SETTINGS);