Refactor NVML, allow unavailable items to disappear, make thermal display

color thresholds configurable both at compile time and at runtime
xmrig · Mar 21, 2019 · edb78f7 · edb78f7
1 parent f0469b5
commit edb78f7
Show file tree

Hide file tree

Showing 5 changed files with 141 additions and 37 deletions.
diff --git a/src/defaults.h b/src/defaults.h
@@ -0,0 +1,31 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <[email protected]>
+ * Copyright 2012-2014 pooler      <[email protected]>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <[email protected]>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <[email protected]>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __DEFAULTS_H__
+#define __DEFAULTS_H__
+
+// Default GPU temperature display thresholds, in degrees Celsius.
+// Readings below L print green, readings above H print red, and anything
+// from L to H inclusive prints yellow. A thread's integer "temp_low" /
+// "temp_high" config values replace these defaults at runtime.
+#define DFL_nvmlTempL 45
+#define DFL_nvmlTempH 65
+
+#endif /* __DEFAULTS_H__ */
diff --git a/src/nvidia/NvmlApi.cpp b/src/nvidia/NvmlApi.cpp
@@ -30,7 +30,7 @@
 
 
 static uv_lib_t nvmlLib;
-static char nvmlVerion[80] = { 0 };
+static char nvmlVersion[NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE] = { 0 };
 
 
 bool NvmlApi::m_available = false;
@@ -50,8 +50,8 @@ static nvmlReturn_t(*pNvmlDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t *
 bool NvmlApi::init()
 {
 #   ifdef _WIN32
-    char tmp[512];
-    ExpandEnvironmentStringsA("%PROGRAMFILES%\\NVIDIA Corporation\\NVSMI\\nvml.dll", tmp, sizeof(tmp));
+    char tmp[261]; //LoadLibrary calls are still "260 char" limited
+    ExpandEnvironmentStringsA(R"(%ProgramFiles%\NVIDIA Corporation\NVSMI\nvml.dll)", tmp, sizeof(tmp));
     if (uv_dlopen(tmp, &nvmlLib) == -1 && uv_dlopen("nvml.dll", &nvmlLib) == -1) {
         return false;
     }
@@ -78,7 +78,7 @@ bool NvmlApi::init()
     m_available = pNvmlInit() == NVML_SUCCESS;
 
     if (pNvmlSystemGetNVMLVersion) {
-        pNvmlSystemGetNVMLVersion(nvmlVerion, sizeof(nvmlVerion));
+        pNvmlSystemGetNVMLVersion(nvmlVersion, sizeof(nvmlVersion));
     }
 
     return m_available;
@@ -95,34 +95,52 @@ void NvmlApi::release()
 }
 
 
-bool NvmlApi::health(int id, Health &health)
+bool NvmlApi::health(int i, Health &health)
 {
-    if (id == -1 || !isAvailable()) {
+    const auto id = static_cast<unsigned int>(i);
+    nvmlDevice_t device;
+
+    if (i == -1 || !isAvailable()
+        ||
+        (pNvmlDeviceGetHandleByIndex && pNvmlDeviceGetHandleByIndex(id, &device) != NVML_SUCCESS)
+    ) {
         return false;
     }
 
-    health.reset();
+    // cache items previously pegged as unavailable via function call failure
+    // this has to happen before the reset or we don't see the previous value
+    const bool hasPowerUsage = MAXUINT32 != health.power;
+    const bool hasFanSpeed   = MAXUINT32 != health.fanSpeed;
+    const bool hasClockInfo  = MAXUINT32 != health.clock;
 
-    nvmlDevice_t device;
-    if (pNvmlDeviceGetHandleByIndex && pNvmlDeviceGetHandleByIndex(id, &device) != NVML_SUCCESS) {
-        return false;
-    }
+    health.reset();
 
     if (pNvmlDeviceGetTemperature) {
         pNvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &health.temperature);
     }
 
     if (pNvmlDeviceGetPowerUsage) {
-        pNvmlDeviceGetPowerUsage(device, &health.power);
+        if (!hasPowerUsage || pNvmlDeviceGetPowerUsage(device, &health.power) != NVML_SUCCESS){
+            health.power = MAXUINT32;
+        }
     }
 
     if (pNvmlDeviceGetFanSpeed) {
-        pNvmlDeviceGetFanSpeed(device, &health.fanSpeed);
+        if (!hasFanSpeed || pNvmlDeviceGetFanSpeed(device, &health.fanSpeed) != NVML_SUCCESS){
+            health.fanSpeed = MAXUINT32;
+        }
     }
 
     if (pNvmlDeviceGetClockInfo) {
-        pNvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &health.clock);
-        pNvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &health.memClock);
+        if (!hasClockInfo
+            ||
+            pNvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &health.clock) != NVML_SUCCESS
+            ||
+            pNvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &health.memClock) != NVML_SUCCESS
+        ) {
+            health.clock = MAXUINT32;
+            health.memClock = MAXUINT32;
+        }
     }
 
     return true;
@@ -131,7 +149,7 @@ bool NvmlApi::health(int id, Health &health)
 
 const char *NvmlApi::version()
 {
+    // Version text that init() stored via pNvmlSystemGetNVMLVersion; an empty string if that call was never made.
-    return nvmlVerion;
+    return nvmlVersion;
 }
 
 
@@ -158,7 +176,7 @@ void NvmlApi::bind(const std::vector<xmrig::IThread*> &threads)
         }
 
         for (xmrig::IThread *t : threads) {
-            auto thread = static_cast<CudaThread *>(t);
+            auto thread = dynamic_cast<CudaThread *>(t);
             if (thread->pciBusID() == pci.bus && thread->pciDeviceID() == pci.device && thread->pciDomainID() == pci.domain) {
                 thread->setNvmlId(i);
                 break;

diff --git a/src/workers/CudaThread.cpp b/src/workers/CudaThread.cpp
@@ -27,6 +27,7 @@
 #include <string.h>
 
 
+#include "defaults.h"
 #include "rapidjson/document.h"
 #include "workers/CudaThread.h"
 
@@ -38,6 +39,8 @@ CudaThread::CudaThread() :
     m_clockRate(0),
     m_memoryClockRate(0),
     m_nvmlId(-1),
+    m_nvmlTempL(DFL_nvmlTempL),
+    m_nvmlTempH(DFL_nvmlTempH),
     m_smx(0),
     m_threads(0),
     m_affinity(-1),
@@ -63,6 +66,8 @@ CudaThread::CudaThread(const nvid_ctx &ctx, int64_t affinity, xmrig::Algo algori
     m_clockRate(ctx.device_clockRate),
     m_memoryClockRate(ctx.device_memoryClockRate),
     m_nvmlId(-1),
+    m_nvmlTempL(DFL_nvmlTempL),
+    m_nvmlTempH(DFL_nvmlTempH),
     m_smx(ctx.device_mpcount),
     m_threads(ctx.device_threads),
     m_affinity(affinity),
@@ -88,6 +93,8 @@ CudaThread::CudaThread(const rapidjson::Value &object) :
     m_clockRate(0),
     m_memoryClockRate(0),
     m_nvmlId(-1),
+    m_nvmlTempL(DFL_nvmlTempL),
+    m_nvmlTempH(DFL_nvmlTempH),
     m_smx(0),
     m_threads(0),
     m_affinity(-1),
@@ -117,6 +124,16 @@ CudaThread::CudaThread(const rapidjson::Value &object) :
     if (affinity.IsInt()) {
         setAffinity(affinity.GetInt());
     }
+
+    const rapidjson::Value &tempL = object["temp_low"];
+    if (tempL.IsInt()) {
+        setNvmlTempL(tempL.GetInt());
+    }
+
+    const rapidjson::Value &tempH = object["temp_high"];
+    if (tempH.IsInt()) {
+        setNvmlTempH(tempH.GetInt());
+    }
 }
 
 

diff --git a/src/workers/CudaThread.h b/src/workers/CudaThread.h
@@ -53,6 +53,8 @@ class CudaThread : public xmrig::IThread
     inline size_t memoryTotal() const     { return m_memoryTotal; }
     inline size_t memoryFree() const      { return m_memoryFree; }
     inline int nvmlId() const             { return m_nvmlId; }
+    // Low display threshold: Workers::printHealth() colours temperatures below it green.
+    inline uint32_t nvmlTempL() const     { return m_nvmlTempL; }
+    // High display threshold: Workers::printHealth() colours temperatures above it red.
+    inline uint32_t nvmlTempH() const     { return m_nvmlTempH; }
     inline int smx() const                { return m_smx; }
     inline int threads() const            { return m_threads; }
     inline size_t threadId() const        { return m_threadId; }
@@ -74,6 +76,8 @@ class CudaThread : public xmrig::IThread
     inline void setBSleep(int bsleep)          { m_bsleep = bsleep; }
     inline void setIndex(size_t index)         { m_index = index; }
     inline void setNvmlId(int id)              { m_nvmlId = id; }
+    // Overrides the low display threshold; the JSON constructor calls this for an integer "temp_low".
+    inline void setNvmlTempL(uint32_t temp)    { m_nvmlTempL = temp; }
+    // Overrides the high display threshold; the JSON constructor calls this for an integer "temp_high".
+    inline void setNvmlTempH(uint32_t temp)    { m_nvmlTempH = temp; }
     inline void setThreadId(size_t threadId)   { m_threadId = threadId; }
     inline void setThreads(int threads)        { m_threads = threads; }
     inline void setSyncMode(uint32_t syncMode) { m_syncMode = syncMode > 3 ? 3 : syncMode; }
@@ -98,6 +102,8 @@ class CudaThread : public xmrig::IThread
     int m_clockRate;
     int m_memoryClockRate;
     int m_nvmlId;
+    uint32_t m_nvmlTempL;
+    uint32_t m_nvmlTempH;
     int m_smx;
     int m_threads;
     int64_t m_affinity;

diff --git a/src/workers/Workers.cpp b/src/workers/Workers.cpp
@@ -113,7 +113,7 @@ void Workers::printHashrate(bool detail)
 
         size_t i = 0;
         for (const xmrig::IThread *t : m_controller->config()->threads()) {
-            auto thread = static_cast<const CudaThread *>(t);
+            auto thread = dynamic_cast<const CudaThread *>(t);
              Log::i()->text("| %6zu | %3zu | %7s | %7s | %7s | %s%s",
                             i, thread->index(),
                             Hashrate::format(m_hashrate->calc(i, Hashrate::ShortInterval), num1, sizeof num1),
@@ -130,6 +130,18 @@ void Workers::printHashrate(bool detail)
     m_hashrate->print();
 }
 
+// printf-style formatter that returns the result as a std::string.
+// Returns an empty string when vsnprintf reports a formatting error.
+const std::string _spf(const char * const fmt, ...)
+{
+    // Declared without "= nullptr": va_list is an array or struct type on some ABIs.
+    va_list args, copy;
+    va_start(args, fmt);
+    va_copy(copy, args);
+    // Sizing pass on the copy, because vsnprintf consumes the va_list it is handed.
+    const int len = std::vsnprintf(nullptr, 0, fmt, copy);
+    va_end(copy);
+
+    std::string out;
+    if (len > 0) {
+        std::vector<char> buf(static_cast<size_t>(len) + 1); // +1 for the terminating NUL
+        std::vsnprintf(buf.data(), buf.size(), fmt, args);
+        out.assign(buf.data(), static_cast<size_t>(len));
+    }
+    va_end(args);
+    return out;
+}
 
 void Workers::printHealth()
 {
+    // Logs one health line per GPU thread, built up field by field so that any value
+    // NvmlApi::health() marked unavailable (MAXUINT32) is simply left out of the line.
@@ -140,32 +152,52 @@ void Workers::printHealth()
 
+    // NOTE(review): this one Health object is reused for every thread, and NvmlApi::health()
+    // reads its previous contents to decide which queries to skip, so a field that fails for
+    // one GPU is also skipped for the GPUs after it in this pass - confirm that is intended.
     Health health;
     for (const xmrig::IThread *t : m_controller->config()->threads()) {
-        auto thread = static_cast<const CudaThread *>(t);
-        if (!NvmlApi::health(thread->nvmlId(), health)) {
+        auto thread = dynamic_cast<const CudaThread *>(t);
+        // dynamic_cast yields nullptr when t is not a CudaThread; skip it rather than dereference
+        if (!thread || !NvmlApi::health(thread->nvmlId(), health)) {
             continue;
         }
 
-        const uint32_t temp = health.temperature;
-
-        if (health.clock && health.clock) {
-            if (m_controller->config()->isColors()) {
-                LOG_INFO("\x1B[00;35mGPU #%d: \x1B[01m%u\x1B[00;35m/\x1B[01m%u MHz\x1B[00;35m \x1B[01m%uW\x1B[00;35m %s%uC\x1B[00;35m FAN \x1B[01m%u%%",
-                    thread->index(), health.clock, health.memClock, health.power / 1000, (temp < 45 ? "\x1B[01;32m" : (temp > 65 ? "\x1B[01;31m" : "\x1B[01;33m")), temp, health.fanSpeed);
-            }
-            else {
-                LOG_INFO(" * GPU #%d: %u/%u MHz %uW %uC FAN %u%%", thread->index(), health.clock, health.memClock, health.power / 1000, health.temperature, health.fanSpeed);
-            }
-
-            continue;
+        const bool isColors = m_controller->config()->isColors();
+
+        // %zu with an explicit size_t cast, matching how printHashrate() formats thread->index()
+        std::string report = _spf(isColors
+               ? MAGENTA("GPU #%zu: ")
+               : "GPU #%zu: "
+               , static_cast<size_t>(thread->index())
+        );
+        if (health.clock != MAXUINT32 && health.memClock != MAXUINT32) {
+            report += _spf(isColors
+                    ? MAGENTA_BOLD("%u") MAGENTA("/") MAGENTA_BOLD("%u MHz") " "
+                    : "%u/%u MHz "
+                    , health.clock, health.memClock
+            );
         }
-
-        if (m_controller->config()->isColors()) {
-            LOG_INFO("\x1B[00;35mGPU #%d: %s%uC\x1B[00;35m FAN \x1B[01m%u%%",
-                thread->index(), (temp < 45 ? "\x1B[01;32m" : (temp > 65 ? "\x1B[01;31m" : "\x1B[01;33m")), temp, health.fanSpeed);
+        // Every field ends with its own separating space so the next one never runs into it.
+        if (health.power != MAXUINT32) {
+            report += _spf(isColors
+                    ? MAGENTA_BOLD("%uW") " "
+                    : "%uW "
+                    , health.power / 1000
+            );
+        }
+        if (health.temperature) {
+            if (!isColors) {
+                report += _spf("%uC ", health.temperature);
+            }
+            else if (health.temperature > thread->nvmlTempH()) {
+                report += _spf(RED_BOLD("%uC") " ", health.temperature);
+            }
+            else if (health.temperature < thread->nvmlTempL()) {
+                report += _spf(GREEN_BOLD("%uC") " ", health.temperature);
+            }
+            else {
+                // readings from the low to the high threshold inclusive
+                report += _spf(YELLOW_BOLD("%uC") " ", health.temperature);
+            }
         }
-        else {
-            LOG_INFO(" * GPU #%d: %uC FAN %u%%", thread->index(), health.temperature, health.fanSpeed);
+        if (health.fanSpeed != MAXUINT32) {
+            report += _spf(isColors
+                    ? "FAN " MAGENTA_BOLD("%u%%")
+                    : "FAN %u%%"
+                    , health.fanSpeed
+            );
         }
+        LOG_INFO("%s", report.c_str());
     }
 }