Added a ton of new NVML stuff

pull/363/head
jsteube 8 years ago
parent d89a8a68d6
commit a23c0c4716

@ -67,6 +67,42 @@ typedef enum nvmlTemperatureThresholds_enum
NVML_TEMPERATURE_THRESHOLD_COUNT
} nvmlTemperatureThresholds_t;
/**
* Compute mode.
*
* NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0.
* Earlier CUDA versions supported a single exclusive mode,
* which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
*/
typedef enum nvmlComputeMode_enum
{
NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device
NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Compute-exclusive-thread mode -- only one context per device, usable from one thread at a time
NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device
NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
// Keep this last
NVML_COMPUTEMODE_COUNT
} nvmlComputeMode_t;
/**
* GPU Operation Mode
*
* GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features.
*
* Each GOM is designed to meet specific user needs.
*/
typedef enum nvmlGom_enum
{
NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed
NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations
//!< are not allowed
NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require
//!< high bandwidth double precision
} nvmlGpuOperationMode_t;
/*
* End of declarations from nvml.h
**/
@ -88,6 +124,12 @@ typedef nvmlReturn_t (*NVML_DEVICE_GET_CLOCKINFO) (nvmlDevice_t, nvmlClockType_t
typedef nvmlReturn_t (*NVML_DEVICE_GET_THRESHOLD) (nvmlDevice_t, nvmlTemperatureThresholds_t, unsigned int *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_CURRPCIELINKGENERATION) (nvmlDevice_t, unsigned int *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_CURRPCIELINKWIDTH) (nvmlDevice_t, unsigned int *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS) (nvmlDevice_t, unsigned long long *);
typedef nvmlReturn_t (*NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS) (nvmlDevice_t, unsigned long long *);
typedef nvmlReturn_t (*NVML_DEVICE_SET_COMPUTEMODE) (nvmlDevice_t, nvmlComputeMode_t);
typedef nvmlReturn_t (*NVML_DEVICE_SET_OPERATIONMODE) (nvmlDevice_t, nvmlGpuOperationMode_t);
typedef nvmlReturn_t (*NVML_DEVICE_GET_POWERMANAGEMENTLIMITCONSTRAINTS) (nvmlDevice_t, unsigned int *, unsigned int *);
typedef nvmlReturn_t (*NVML_DEVICE_SET_POWERMANAGEMENTLIMIT) (nvmlDevice_t, unsigned int);
typedef struct
{
@ -106,6 +148,12 @@ typedef struct
NVML_DEVICE_GET_THRESHOLD nvmlDeviceGetTemperatureThreshold;
NVML_DEVICE_GET_CURRPCIELINKGENERATION nvmlDeviceGetCurrPcieLinkGeneration;
NVML_DEVICE_GET_CURRPCIELINKWIDTH nvmlDeviceGetCurrPcieLinkWidth;
NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS nvmlDeviceGetCurrentClocksThrottleReasons;
NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS nvmlDeviceGetSupportedClocksThrottleReasons;
NVML_DEVICE_SET_COMPUTEMODE nvmlDeviceSetComputeMode;
NVML_DEVICE_SET_OPERATIONMODE nvmlDeviceSetGpuOperationMode;
NVML_DEVICE_GET_POWERMANAGEMENTLIMITCONSTRAINTS nvmlDeviceGetPowerManagementLimitConstraints;
NVML_DEVICE_SET_POWERMANAGEMENTLIMIT nvmlDeviceSetPowerManagementLimit;
} hm_nvml_lib_t;
@ -127,6 +175,12 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetClockInfo (NVML_PTR *nvml, nvmlDevice_t device
nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
nvmlReturn_t hm_NVML_nvmlDeviceGetCurrPcieLinkGeneration (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *currLinkGen);
nvmlReturn_t hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *currLinkWidth);
nvmlReturn_t hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
nvmlReturn_t hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
nvmlReturn_t hm_NVML_nvmlDeviceSetComputeMode (NVML_PTR *nvml, int, nvmlDevice_t device, nvmlComputeMode_t mode);
nvmlReturn_t hm_NVML_nvmlDeviceSetGpuOperationMode (NVML_PTR *nvml, int, nvmlDevice_t device, nvmlGpuOperationMode_t mode);
nvmlReturn_t hm_NVML_nvmlDeviceGetPowerManagementLimitConstraints (NVML_PTR *nvml, int, nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
nvmlReturn_t hm_NVML_nvmlDeviceSetPowerManagementLimit (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, unsigned int limit);
#endif // HAVE_HWMON

@ -1446,6 +1446,7 @@ int hm_check_fanspeed_control (void *adl, hm_attrs_t *hm_device, u32 *valid_adl_
// void hm_get_opencl_busid_devid (hm_attrs_t *hm_device, uint opencl_num_devices, cl_device_id *devices);
int hm_get_threshold_slowdown_with_device_id (const uint device_id);
int hm_get_threshold_shutdown_with_device_id (const uint device_id);
int hm_get_temperature_with_device_id (const uint device_id);
int hm_get_fanspeed_with_device_id (const uint device_id);
int hm_get_fanpolicy_with_device_id (const uint device_id);

@ -56,6 +56,12 @@ int nvml_init (NVML_PTR *nvml)
HC_LOAD_FUNC(nvml, nvmlDeviceGetTemperatureThreshold, NVML_DEVICE_GET_THRESHOLD, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrPcieLinkGeneration, NVML_DEVICE_GET_CURRPCIELINKGENERATION, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrPcieLinkWidth, NVML_DEVICE_GET_CURRPCIELINKWIDTH, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetCurrentClocksThrottleReasons, NVML_DEVICE_GET_CURRENTCLOCKSTHROTTLEREASONS, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetSupportedClocksThrottleReasons, NVML_DEVICE_GET_SUPPORTEDCLOCKSTHROTTLEREASONS, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceSetComputeMode, NVML_DEVICE_SET_COMPUTEMODE, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceSetGpuOperationMode, NVML_DEVICE_SET_OPERATIONMODE, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceGetPowerManagementLimitConstraints, NVML_DEVICE_GET_POWERMANAGEMENTLIMITCONSTRAINTS, NVML, 0)
HC_LOAD_FUNC(nvml, nvmlDeviceSetPowerManagementLimit, NVML_DEVICE_SET_POWERMANAGEMENTLIMIT, NVML, 0)
return 0;
}
@ -153,11 +159,9 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetTemperature (NVML_PTR *nvml, nvmlDevice_t devi
if (nvml_rc != NVML_SUCCESS)
{
*temp = -1;
//const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
//log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperature()", nvml_rc, string);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperature()", nvml_rc, string);
}
return nvml_rc;
@ -171,8 +175,6 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetFanSpeed (NVML_PTR *nvml, int skip_warnings, n
if (nvml_rc != NVML_SUCCESS)
{
*speed = -1;
if (skip_warnings == 0)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
@ -184,8 +186,6 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetFanSpeed (NVML_PTR *nvml, int skip_warnings, n
return nvml_rc;
}
/* only tesla following */
nvmlReturn_t hm_NVML_nvmlDeviceGetPowerUsage (NVML_PTR *nvml, nvmlDevice_t device, unsigned int *power)
{
if (!nvml) return -1;
@ -194,11 +194,9 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetPowerUsage (NVML_PTR *nvml, nvmlDevice_t devic
if (nvml_rc != NVML_SUCCESS)
{
*power = -1;
//const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
//log_info ("WARN: %s %d %s\n", "nvmlDeviceGetPowerUsage()", nvml_rc, string);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetPowerUsage()", nvml_rc, string);
}
return nvml_rc;
@ -212,12 +210,9 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetUtilizationRates (NVML_PTR *nvml, nvmlDevice_t
if (nvml_rc != NVML_SUCCESS)
{
utilization->gpu = -1;
utilization->memory = -1;
//const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
//log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
}
return nvml_rc;
@ -231,11 +226,9 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetClockInfo (NVML_PTR *nvml, nvmlDevice_t device
if (nvml_rc != NVML_SUCCESS)
{
*clock = -1;
//const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
//log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
}
return nvml_rc;
@ -249,11 +242,9 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetTemperatureThreshold (NVML_PTR *nvml, nvmlDevi
if (nvml_rc != NVML_SUCCESS)
{
*temp = -1;
//const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
//log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetTemperatureThreshold()", nvml_rc, string);
}
return nvml_rc;
@ -267,11 +258,9 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetCurrPcieLinkGeneration (NVML_PTR *nvml, nvmlDe
if (nvml_rc != NVML_SUCCESS)
{
*currLinkGen = -1;
//const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
//log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
}
return nvml_rc;
@ -285,11 +274,117 @@ nvmlReturn_t hm_NVML_nvmlDeviceGetCurrPcieLinkWidth (NVML_PTR *nvml, nvmlDevice_
if (nvml_rc != NVML_SUCCESS)
{
*currLinkWidth = -1;
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
}
return nvml_rc;
}
nvmlReturn_t hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *clocksThrottleReasons)
{
if (!nvml) return -1;
nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetCurrentClocksThrottleReasons (device, clocksThrottleReasons);
if (nvml_rc != NVML_SUCCESS)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
}
return nvml_rc;
}
nvmlReturn_t hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (NVML_PTR *nvml, nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons)
{
if (!nvml) return -1;
nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetSupportedClocksThrottleReasons (device, supportedClocksThrottleReasons);
if (nvml_rc != NVML_SUCCESS)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetSupportedClocksThrottleReasons()", nvml_rc, string);
}
return nvml_rc;
}
nvmlReturn_t hm_NVML_nvmlDeviceSetComputeMode (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, nvmlComputeMode_t mode)
{
if (!nvml) return -1;
nvmlReturn_t nvml_rc = nvml->nvmlDeviceSetComputeMode (device, mode);
if (nvml_rc != NVML_SUCCESS)
{
if (skip_warnings == 0)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
log_info ("WARN: %s %d %s\n", "nvmlDeviceSetComputeMode()", nvml_rc, string);
}
}
return nvml_rc;
}
nvmlReturn_t hm_NVML_nvmlDeviceSetGpuOperationMode (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, nvmlGpuOperationMode_t mode)
{
if (!nvml) return -1;
nvmlReturn_t nvml_rc = nvml->nvmlDeviceSetGpuOperationMode (device, mode);
if (nvml_rc != NVML_SUCCESS)
{
if (skip_warnings == 0)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
log_info ("WARN: %s %d %s\n", "nvmlDeviceSetGpuOperationMode()", nvml_rc, string);
}
}
return nvml_rc;
}
nvmlReturn_t hm_NVML_nvmlDeviceGetPowerManagementLimitConstraints (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit)
{
if (!nvml) return -1;
nvmlReturn_t nvml_rc = nvml->nvmlDeviceGetPowerManagementLimitConstraints (device, minLimit, maxLimit);
if (nvml_rc != NVML_SUCCESS)
{
if (skip_warnings == 0)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
log_info ("WARN: %s %d %s\n", "nvmlDeviceGetPowerManagementLimitConstraints()", nvml_rc, string);
}
}
//const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
return nvml_rc;
}
//log_info ("WARN: %s %d %s\n", "nvmlDeviceGetUtilizationRates()", nvml_rc, string);
nvmlReturn_t hm_NVML_nvmlDeviceSetPowerManagementLimit (NVML_PTR *nvml, int skip_warnings, nvmlDevice_t device, unsigned int limit)
{
if (!nvml) return -1;
nvmlReturn_t nvml_rc = nvml->nvmlDeviceSetPowerManagementLimit (device, limit);
if (nvml_rc != NVML_SUCCESS)
{
if (skip_warnings == 0)
{
const char *string = hm_NVML_nvmlErrorString (nvml, nvml_rc);
log_info ("WARN: %s %d %s\n", "nvmlDeviceSetPowerManagementLimit()", nvml_rc, string);
}
}
return nvml_rc;

@ -1547,8 +1547,7 @@ void status_display ()
const int num_corespeed = hm_get_corespeed_with_device_id (device_id);
const int num_memoryspeed = hm_get_memoryspeed_with_device_id (device_id);
const int num_buslanes = hm_get_buslanes_with_device_id (device_id);
// not working
//const int num_throttle = hm_get_throttle_with_device_id (device_id);
const int num_throttle = hm_get_throttle_with_device_id (device_id);
char output_buf[256] = { 0 };
@ -1596,14 +1595,12 @@ void status_display ()
output_len = strlen (output_buf);
}
/*
if (num_throttle >= 0)
if (num_throttle == 1)
{
snprintf (output_buf + output_len, sizeof (output_buf) - output_len, " Throttle:%u", num_throttle);
snprintf (output_buf + output_len, sizeof (output_buf) - output_len, " *Throttled*");
output_len = strlen (output_buf);
}
*/
if (output_len == 0)
{
@ -14013,6 +14010,21 @@ int main (int argc, char **argv)
unsigned int speed;
if (hm_NVML_nvmlDeviceGetFanSpeed (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, &speed) != NVML_ERROR_NOT_SUPPORTED) hm_adapters_nv[i].fan_get_supported = 1;
hm_NVML_nvmlDeviceSetComputeMode (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, NVML_COMPUTEMODE_EXCLUSIVE_PROCESS);
hm_NVML_nvmlDeviceSetGpuOperationMode (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, NVML_GOM_ALL_ON);
unsigned int minLimit;
unsigned int maxLimit;
if (hm_NVML_nvmlDeviceGetPowerManagementLimitConstraints (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, &minLimit, &maxLimit) == NVML_SUCCESS)
{
if (maxLimit > 0)
{
hm_NVML_nvmlDeviceSetPowerManagementLimit (data.hm_nv, 1, hm_adapters_nv[i].adapter_index.nv, maxLimit);
}
}
}
}
}
@ -15563,8 +15575,12 @@ int main (int argc, char **argv)
if (gpu_temp_disable == 0)
{
const int gpu_temp_threshold_slowdown = hm_get_threshold_slowdown_with_device_id (device_id);
const int gpu_temp_threshold_shutdown = hm_get_threshold_slowdown_with_device_id (device_id);
data.hm_device[device_id].gpu_temp_threshold_slowdown = (gpu_temp_threshold_slowdown > 0) ? gpu_temp_threshold_slowdown : 10000;
data.hm_device[device_id].gpu_temp_threshold_shutdown = (gpu_temp_threshold_shutdown > 0) ? gpu_temp_threshold_shutdown : 10000;
data.hm_device[device_id].gpu_temp_threshold_slowdown = (gpu_temp_threshold_slowdown == -1) ? 100000 : gpu_temp_threshold_slowdown;
// we could use those numbers for gpu_temp_retain and gpu_temp_abort, too
}
/**

@ -3074,6 +3074,37 @@ int hm_get_threshold_slowdown_with_device_id (const uint device_id)
return -1;
}
int hm_get_threshold_shutdown_with_device_id (const uint device_id)
{
if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
if (data.devices_param[device_id].device_vendor_id == VENDOR_ID_AMD)
{
if (data.hm_amd)
{
if (data.hm_device[device_id].od_version == 5)
{
}
else if (data.hm_device[device_id].od_version == 6)
{
}
}
}
if (data.devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
{
int target = 0;
hm_NVML_nvmlDeviceGetTemperatureThreshold (data.hm_nv, data.hm_device[device_id].adapter_index.nv, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, (unsigned int *) &target);
return target;
}
return -1;
}
int hm_get_temperature_with_device_id (const uint device_id)
{
if ((data.devices_param[device_id].device_type & CL_DEVICE_TYPE_GPU) == 0) return -1;
@ -3341,7 +3372,16 @@ int hm_get_throttle_with_device_id (const uint device_id)
if (data.devices_param[device_id].device_vendor_id == VENDOR_ID_NV)
{
unsigned long long clocksThrottleReasons = 0;
unsigned long long supportedThrottleReasons = 0;
hm_NVML_nvmlDeviceGetCurrentClocksThrottleReasons (data.hm_nv, data.hm_device[device_id].adapter_index.nv, &clocksThrottleReasons);
hm_NVML_nvmlDeviceGetSupportedClocksThrottleReasons (data.hm_nv, data.hm_device[device_id].adapter_index.nv, &supportedThrottleReasons);
clocksThrottleReasons &= supportedThrottleReasons;
return (clocksThrottleReasons > 0);
}
return -1;

Loading…
Cancel
Save