Rework GPUs to allow for multiple GPUs

Using sysfs, we iterate through the available GPUs and determine which one is the active GPU through the fdinfo of the app. `gpu_stats` is rewritten to display all available GPUs and their stats, or only the currently active GPU when the `active_gpu` parameter is used. `vram` is likewise rewritten to display VRAM for all GPUs or for the active GPU. `throttling` only displays data for the active GPU, as we don't expect an idling GPU to give relevant throttling information.
flightlessmango · Sep 17, 2024 · fd65d66 · fd65d66
1 parent feef6e3
commit fd65d66
Show file tree

Hide file tree

Showing 32 changed files with 1,317 additions and 1,320 deletions.
diff --git a/meson.build b/meson.build
@@ -3,7 +3,7 @@ project('MangoHud',
   version : 'v0.7.2',
   license : 'MIT',
   meson_version: '>=0.60.0',
-  default_options : ['buildtype=release', 'c_std=c99', 'cpp_std=c++14', 'warning_level=2']
+  default_options : ['buildtype=release', 'c_std=c99', 'cpp_std=c++17', 'warning_level=2']
 )
 
 cc = meson.get_compiler('c')

diff --git a/src/amdgpu.cpp b/src/amdgpu.cpp
@@ -11,15 +11,7 @@
 #include "logging.h"
 #include "mesa/util/macros.h"
 
-std::string metrics_path = "";
-struct amdgpu_common_metrics amdgpu_common_metrics;
-std::mutex amdgpu_common_metrics_m;
-std::mutex amdgpu_m;
-std::condition_variable amdgpu_c;
-bool amdgpu_run_thread = true;
-std::unique_ptr<Throttling> throttling;
-
-bool amdgpu_verify_metrics(const std::string& path)
+bool AMDGPU::verify_metrics(const std::string& path)
 {
 	metrics_table_header header {};
 	FILE *f;
@@ -45,7 +37,8 @@ bool amdgpu_verify_metrics(const std::string& path)
 		case 2: // v2_1, v2_2, v2_3, v2_4
 			if(header.content_revision<=0 || header.content_revision>4)// v2_0, not naturally aligned
 				break;
-			cpuStats.cpu_type = "APU";
+
+			this->is_apu = true;
 			return true;
 		default:
 			break;
@@ -56,18 +49,18 @@ bool amdgpu_verify_metrics(const std::string& path)
 }
 
 #define IS_VALID_METRIC(FIELD) (FIELD != 0xffff)
-void amdgpu_get_instant_metrics(struct amdgpu_common_metrics *metrics) {
+void AMDGPU::get_instant_metrics(struct amdgpu_common_metrics *metrics) {
 	FILE *f;
 	void *buf[MAX(sizeof(struct gpu_metrics_v1_3), sizeof(struct gpu_metrics_v2_4))/sizeof(void*)+1];
 	struct metrics_table_header* header = (metrics_table_header*)buf;
 
-	f = fopen(metrics_path.c_str(), "rb");
+	f = fopen(gpu_metrics_path.c_str(), "rb");
 	if (!f)
 		return;
 
 	// Read the whole file
 	if (fread(buf, sizeof(buf), 1, f) != 0) {
-		SPDLOG_DEBUG("amdgpu metrics file '{}' is larger than the buffer", metrics_path.c_str());
+		SPDLOG_DEBUG("amdgpu metrics file '{}' is larger than the buffer", gpu_metrics_path.c_str());
 		fclose(f);
 		return;
 	}
@@ -105,42 +98,7 @@ void amdgpu_get_instant_metrics(struct amdgpu_common_metrics *metrics) {
 			do metrics->average_cpu_power_w = metrics->average_cpu_power_w + amdgpu_metrics->average_core_power[i] / 1000.f;
 			while (++i < ARRAY_SIZE(amdgpu_metrics->average_core_power) && IS_VALID_METRIC(amdgpu_metrics->average_core_power[i]));
 		} else if( IS_VALID_METRIC(amdgpu_metrics->average_socket_power) && IS_VALID_METRIC(amdgpu_metrics->average_gfx_power) ) {
-			// fallback 2: estimate cpu power from total socket power
-			metrics->average_cpu_power_w = amdgpu_metrics->average_socket_power / 1000.f - amdgpu_metrics->average_gfx_power / 1000.f;
-		} else {
-			// giving up
-			metrics->average_cpu_power_w = 0;
-		}
-
-		if( IS_VALID_METRIC(amdgpu_metrics->current_gfxclk) ) {
-			// prefered method
-			metrics->current_gfxclk_mhz = amdgpu_metrics->current_gfxclk;
-		} else if( IS_VALID_METRIC(amdgpu_metrics->average_gfxclk_frequency) ) {
-			// fallback 1
-			metrics->current_gfxclk_mhz = amdgpu_metrics->average_gfxclk_frequency;
-		} else {
-			// giving up
-			metrics->current_gfxclk_mhz = 0;
-		}
-		if( IS_VALID_METRIC(amdgpu_metrics->current_uclk) ) {
-			// prefered method
-			metrics->current_uclk_mhz = amdgpu_metrics->current_uclk;
-		} else if( IS_VALID_METRIC(amdgpu_metrics->average_uclk_frequency) ) {
-			// fallback 1
-			metrics->current_uclk_mhz = amdgpu_metrics->average_uclk_frequency;
-		} else {
-			// giving up
-			metrics->current_uclk_mhz = 0;
-		}
-
-		if( IS_VALID_METRIC(amdgpu_metrics->temperature_soc) ) {
-			// prefered method
-			metrics->soc_temp_c = amdgpu_metrics->temperature_soc / 100;
-		} else if( header->content_revision >= 3 && IS_VALID_METRIC(amdgpu_metrics->average_temperature_soc) ) {
-			// fallback 1
-			metrics->soc_temp_c = amdgpu_metrics->average_temperature_soc / 100;
-		} else {
-			// giving up
+			// fallback 2: estimate cpu power from total socket power
 			metrics->soc_temp_c = 0;
 		}
 		if( IS_VALID_METRIC(amdgpu_metrics->temperature_gfx) ) {
@@ -167,9 +125,11 @@ void amdgpu_get_instant_metrics(struct amdgpu_common_metrics *metrics) {
 			do cpu_temp = MAX(cpu_temp, amdgpu_metrics->average_temperature_core[i]);
 			while (++i < ARRAY_SIZE(amdgpu_metrics->average_temperature_core) && IS_VALID_METRIC(amdgpu_metrics->average_temperature_core[i]));
 			metrics->apu_cpu_temp_c = cpu_temp / 100;
+#ifdef DETECT_OS_UNIX
 		} else if( cpuStats.ReadcpuTempFile(cpu_temp) ) {
 			// fallback 2: Try temp from file 'm_cpuTempFile' of 'cpu.cpp'
 			metrics->apu_cpu_temp_c = cpu_temp;
+#endif
 		} else {
 			// giving up
 			metrics->apu_cpu_temp_c = 0;
@@ -189,10 +149,12 @@ void amdgpu_get_instant_metrics(struct amdgpu_common_metrics *metrics) {
 		throttling->indep_throttle_status = indep_throttle_status;
 }
 
-void amdgpu_get_samples_and_copy(struct amdgpu_common_metrics metrics_buffer[METRICS_SAMPLE_COUNT], bool &gpu_load_needs_dividing) {
+void AMDGPU::get_samples_and_copy(struct amdgpu_common_metrics metrics_buffer[METRICS_SAMPLE_COUNT], bool &gpu_load_needs_dividing) {
+	while (!stop_thread) {
 		// Get all the samples
 		for (size_t cur_sample_id=0; cur_sample_id < METRICS_SAMPLE_COUNT; cur_sample_id++) {
-			amdgpu_get_instant_metrics(&metrics_buffer[cur_sample_id]);
+			if (gpu_metrics_is_valid)
+				get_instant_metrics(&metrics_buffer[cur_sample_id]);
 
 			// Detect and fix if the gpu load is reported in centipercent
 			if (gpu_load_needs_dividing || metrics_buffer[cur_sample_id].gpu_load_percent > 100){
@@ -203,34 +165,65 @@ void amdgpu_get_samples_and_copy(struct amdgpu_common_metrics metrics_buffer[MET
 			usleep(METRICS_POLLING_PERIOD_MS * 1000);
 		}
 
-		// Copy the results from the different metrics to amdgpu_common_metrics
-		amdgpu_common_metrics_m.lock();
-		UPDATE_METRIC_AVERAGE(gpu_load_percent);
-		UPDATE_METRIC_AVERAGE_FLOAT(average_gfx_power_w);
-		UPDATE_METRIC_AVERAGE_FLOAT(average_cpu_power_w);
+		if (stop_thread) break;
+
+        std::unique_lock<std::mutex> lock(metrics_mutex);
+        cond_var.wait(lock, [this]() { return !paused || stop_thread; });
+		// do one pass of metrics from sysfs nodes
+		// then we replace with GPU metrics if it's available
+		get_sysfs_metrics();
+
+		if (gpu_metrics_is_valid) {
+			UPDATE_METRIC_AVERAGE(gpu_load_percent);
+			UPDATE_METRIC_AVERAGE_FLOAT(average_gfx_power_w);
+			UPDATE_METRIC_AVERAGE_FLOAT(average_cpu_power_w);
+
+			UPDATE_METRIC_AVERAGE(current_gfxclk_mhz);
+			UPDATE_METRIC_AVERAGE(current_uclk_mhz);
+
+			UPDATE_METRIC_AVERAGE(soc_temp_c);
+			UPDATE_METRIC_AVERAGE(gpu_temp_c);
+			UPDATE_METRIC_AVERAGE(apu_cpu_temp_c);
 
-		UPDATE_METRIC_AVERAGE(current_gfxclk_mhz);
-		UPDATE_METRIC_AVERAGE(current_uclk_mhz);
+			UPDATE_METRIC_MAX(is_power_throttled);
+			UPDATE_METRIC_MAX(is_current_throttled);
+			UPDATE_METRIC_MAX(is_temp_throttled);
+			UPDATE_METRIC_MAX(is_other_throttled);
 
-		UPDATE_METRIC_AVERAGE(soc_temp_c);
-		UPDATE_METRIC_AVERAGE(gpu_temp_c);
-		UPDATE_METRIC_AVERAGE(apu_cpu_temp_c);
+			UPDATE_METRIC_MAX(fan_speed);
+			metrics.fan_rpm = true;
 
-		UPDATE_METRIC_MAX(is_power_throttled);
-		UPDATE_METRIC_MAX(is_current_throttled);
-		UPDATE_METRIC_MAX(is_temp_throttled);
-		UPDATE_METRIC_MAX(is_other_throttled);
+			metrics.load = amdgpu_common_metrics.gpu_load_percent;
+			metrics.powerUsage = amdgpu_common_metrics.average_gfx_power_w;
+			metrics.MemClock = amdgpu_common_metrics.current_uclk_mhz;
 
-		UPDATE_METRIC_MAX(fan_speed);
-		amdgpu_common_metrics_m.unlock();
+			// Use hwmon instead, see gpu.cpp
+			if ( deviceID == 0x1435 || deviceID == 0x163f )
+			{
+				// If we are on VANGOGH (Steam Deck), then
+				// always use core clock from GPU metrics.
+				metrics.CoreClock = amdgpu_common_metrics.current_gfxclk_mhz;
+			}
+			metrics.temp = amdgpu_common_metrics.gpu_temp_c;
+			metrics.apu_cpu_power = amdgpu_common_metrics.average_cpu_power_w;
+			metrics.apu_cpu_temp = amdgpu_common_metrics.apu_cpu_temp_c;
+
+			metrics.is_power_throttled = amdgpu_common_metrics.is_power_throttled;
+			metrics.is_current_throttled = amdgpu_common_metrics.is_current_throttled;
+			metrics.is_temp_throttled = amdgpu_common_metrics.is_temp_throttled;
+			metrics.is_other_throttled = amdgpu_common_metrics.is_other_throttled;
+
+			metrics.fan_speed = amdgpu_common_metrics.fan_speed;
+		}
+	}
 }
 
-void amdgpu_metrics_polling_thread() {
+void AMDGPU::metrics_polling_thread() {
 	struct amdgpu_common_metrics metrics_buffer[METRICS_SAMPLE_COUNT];
 	bool gpu_load_needs_dividing = false;  //some GPUs report load as centipercent
 
 	// Initial poll of the metrics, so that we have values to display as fast as possible
-	amdgpu_get_instant_metrics(&amdgpu_common_metrics);
+	get_instant_metrics(&amdgpu_common_metrics);
 	if (amdgpu_common_metrics.gpu_load_percent > 100){
 		gpu_load_needs_dividing = true;
 		amdgpu_common_metrics.gpu_load_percent /= 100;
@@ -240,48 +233,156 @@ void amdgpu_metrics_polling_thread() {
 	memset(metrics_buffer, 0, sizeof(metrics_buffer));
 
 	while (1) {
-		std::unique_lock<std::mutex> lock(amdgpu_m);
-		amdgpu_c.wait(lock, []{return amdgpu_run_thread;});
-		lock.unlock();
 #ifndef TEST_ONLY
 		if (HUDElements.params->no_display && !logger->is_active())
 			usleep(100000);
 		else
 #endif
-			amdgpu_get_samples_and_copy(metrics_buffer, gpu_load_needs_dividing);
+			get_samples_and_copy(metrics_buffer, gpu_load_needs_dividing);
 	}
 }
 
-void amdgpu_get_metrics(uint32_t deviceID){
-	static bool init = false;
-	if (!init){
-		std::thread(amdgpu_metrics_polling_thread).detach();
-		init = true;
+void AMDGPU::get_sysfs_metrics() {
+    int64_t value = 0;
+	if (sysfs_nodes.busy) {
+		rewind(sysfs_nodes.busy);
+		fflush(sysfs_nodes.busy);
+		int value = 0;
+		if (fscanf(sysfs_nodes.busy, "%d", &value) != 1)
+			value = 0;
+		metrics.load = value;
 	}
 
-	amdgpu_common_metrics_m.lock();
-	gpu_info.load = amdgpu_common_metrics.gpu_load_percent;
+	if (sysfs_nodes.memory_clock) {
+		rewind(sysfs_nodes.memory_clock);
+		fflush(sysfs_nodes.memory_clock);
+		if (fscanf(sysfs_nodes.memory_clock, "%" PRId64, &value) != 1)
+			value = 0;
 
-	gpu_info.powerUsage = amdgpu_common_metrics.average_gfx_power_w;
-	gpu_info.MemClock = amdgpu_common_metrics.current_uclk_mhz;
+		metrics.MemClock = value / 1000000;
+	}
 
-	// Use hwmon instead, see gpu.cpp
-	if ( deviceID == 0x1435 || deviceID == 0x163f )
-	{
-		// If we are on VANGOGH (Steam Deck), then
-		// always use use core clock from GPU metrics.
-		gpu_info.CoreClock = amdgpu_common_metrics.current_gfxclk_mhz;
+	// TODO: on some gpus this will use the power1_input instead
+	// this value is instantaneous and should be averaged over time
+	// probably just average everything in this function to be safe
+	if (sysfs_nodes.power_usage) {
+		rewind(sysfs_nodes.power_usage);
+		fflush(sysfs_nodes.power_usage);
+		if (fscanf(sysfs_nodes.power_usage, "%" PRId64, &value) != 1)
+			value = 0;
+
+		metrics.powerUsage = value / 1000000;
+	}
+
+	if (sysfs_nodes.fan) {
+		rewind(sysfs_nodes.fan);
+		fflush(sysfs_nodes.fan);
+		if (fscanf(sysfs_nodes.fan, "%" PRId64, &value) != 1)
+			value = 0;
+		metrics.fan_speed = value;
+		metrics.fan_rpm = true;
+	}
+
+	if (sysfs_nodes.vram_total) {
+		rewind(sysfs_nodes.vram_total);
+		fflush(sysfs_nodes.vram_total);
+		if (fscanf(sysfs_nodes.vram_total, "%" PRId64, &value) != 1)
+			value = 0;
+		metrics.memoryTotal = float(value) / (1024 * 1024 * 1024);
+	}
+
+	if (sysfs_nodes.vram_used) {
+		rewind(sysfs_nodes.vram_used);
+		fflush(sysfs_nodes.vram_used);
+		if (fscanf(sysfs_nodes.vram_used, "%" PRId64, &value) != 1)
+			value = 0;
+		metrics.memoryUsed = float(value) / (1024 * 1024 * 1024);
+	}
+	// On some GPUs SMU can sometimes return the wrong temperature.
+	// As HWMON is way more visible than the SMU metrics, let's always trust it as it is the most likely to work
+	if (sysfs_nodes.core_clock) {
+		rewind(sysfs_nodes.core_clock);
+		fflush(sysfs_nodes.core_clock);
+		if (fscanf(sysfs_nodes.core_clock, "%" PRId64, &value) != 1)
+			value = 0;
+
+		metrics.CoreClock = value / 1000000;
 	}
-	// gpu_info.temp = amdgpu_common_metrics.gpu_temp_c;
-	gpu_info.apu_cpu_power = amdgpu_common_metrics.average_cpu_power_w;
-	gpu_info.apu_cpu_temp = amdgpu_common_metrics.apu_cpu_temp_c;
 
-	gpu_info.is_power_throttled = amdgpu_common_metrics.is_power_throttled;
-	gpu_info.is_current_throttled = amdgpu_common_metrics.is_current_throttled;
-	gpu_info.is_temp_throttled = amdgpu_common_metrics.is_temp_throttled;
-	gpu_info.is_other_throttled = amdgpu_common_metrics.is_other_throttled;
+	if (sysfs_nodes.temp){
+		rewind(sysfs_nodes.temp);
+		fflush(sysfs_nodes.temp);
+		int value = 0;
+		if (fscanf(sysfs_nodes.temp, "%d", &value) != 1)
+			value = 0;
+		metrics.temp = value / 1000;
+	}
 
-	gpu_info.fan_speed = amdgpu_common_metrics.fan_speed;
+	if (sysfs_nodes.junction_temp){
+		rewind(sysfs_nodes.junction_temp);
+		fflush(sysfs_nodes.junction_temp);
+		int value = 0;
+		if (fscanf(sysfs_nodes.junction_temp, "%d", &value) != 1)
+			value = 0;
+		metrics.junction_temp = value / 1000;
+	}
 
-	amdgpu_common_metrics_m.unlock();
+	if (sysfs_nodes.memory_temp){
+		rewind(sysfs_nodes.memory_temp);
+		fflush(sysfs_nodes.memory_temp);
+		int value = 0;
+		if (fscanf(sysfs_nodes.memory_temp, "%d", &value) != 1)
+			value = 0;
+		metrics.memory_temp = value / 1000;
+	}
+
+	if (sysfs_nodes.gtt_used) {
+		rewind(sysfs_nodes.gtt_used);
+		fflush(sysfs_nodes.gtt_used);
+		if (fscanf(sysfs_nodes.gtt_used, "%" PRId64, &value) != 1)
+			value = 0;
+		metrics.gtt_used = float(value) / (1024 * 1024 * 1024);
+	}
+
+	if (sysfs_nodes.gpu_voltage_soc) {
+		rewind(sysfs_nodes.gpu_voltage_soc);
+		fflush(sysfs_nodes.gpu_voltage_soc);
+		if (fscanf(sysfs_nodes.gpu_voltage_soc, "%" PRId64, &value) != 1)
+			value = 0;
+		metrics.voltage = value;
+	}
 }
+
+AMDGPU::AMDGPU(std::string pci_dev, uint32_t device_id, uint32_t vendor_id) {
+	this->pci_dev = pci_dev;
+	this->device_id = device_id;
+	this->vendor_id = vendor_id;
+	const std::string device_path = "/sys/bus/pci/devices/" + pci_dev;
+	gpu_metrics_path = device_path + "/gpu_metrics";
+	gpu_metrics_is_valid = verify_metrics(gpu_metrics_path);
+
+	sysfs_nodes.busy = fopen((device_path + "/gpu_busy_percent").c_str(), "r");
+	sysfs_nodes.vram_total = fopen((device_path + "/mem_info_vram_total").c_str(), "r");
+	sysfs_nodes.vram_used = fopen((device_path + "/mem_info_vram_used").c_str(), "r");
+	sysfs_nodes.gtt_used = fopen((device_path + "/mem_info_gtt_used").c_str(), "r");
+
+	const std::string hwmon_path = device_path + "/hwmon/";
+	if (fs::exists(hwmon_path)){
+		const auto dirs = ls(hwmon_path.c_str(), "hwmon", LS_DIRS);
+		for (const auto& dir : dirs) {
+			sysfs_nodes.temp = fopen((hwmon_path + dir + "/temp1_input").c_str(), "r");
+			sysfs_nodes.junction_temp = fopen((hwmon_path + dir + "/temp2_input").c_str(), "r");
+			sysfs_nodes.memory_temp = fopen((hwmon_path + dir + "/temp3_input").c_str(), "r");
+			sysfs_nodes.core_clock = fopen((hwmon_path + dir + "/freq1_input").c_str(), "r");
+			sysfs_nodes.gpu_voltage_soc = fopen((hwmon_path + dir + "/in0_input").c_str(), "r");
+			sysfs_nodes.memory_clock = fopen((hwmon_path + dir + "/freq2_input").c_str(), "r");
+			sysfs_nodes.power_usage = fopen((hwmon_path + dir + "/power1_average").c_str(), "r");
+			sysfs_nodes.power_usage = fopen((hwmon_path + dir + "/power1_input").c_str(), "r");
+			sysfs_nodes.fan = fopen((hwmon_path + dir + "/fan1_input").c_str(), "r");
+		}
+	}
+
+	throttling = std::make_shared<Throttling>(0x1002);
+	std::thread thread(&AMDGPU::metrics_polling_thread, this);
+	thread.detach();
+}