add min and max temp, refactor nvidia api

This commit is contained in:
2024-04-21 17:33:21 +00:00
parent 74432b1929
commit d75f4a8431
7 changed files with 192 additions and 179 deletions

View File

@@ -3,32 +3,42 @@ package nvidia
import (
"errors"
"fmt"
"log"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
type (
Repository struct {
driverVersion string
cudaVersion string
cache struct {
gpus map[string]GPU
mu sync.RWMutex
}
GPUSummary struct {
UUID string
Name string
}
GPU struct {
Name string `json:"name"`
UUID string `json:"uuid"`
Index int `json:"-"`
Name string `json:"name"`
UUID string `json:"uuid"`
Index int `json:"-"`
Temperature Temperature `json:"temperature"`
Utilization Utilization `json:"usage"`
Processes []Process `json:"processes"`
Memory Memory `json:"memory"`
Fans []Fan `json:"fans"`
}
GPUDetail struct {
GPU
CoreTemperature int `json:"coreTemperature"`
Utilization Utilization `json:"usage"`
Processes []Process `json:"processes"`
Memory Memory `json:"memory"`
Fans []Fan `json:"fans"`
Temperature struct {
Min uint
Max uint
Current uint
}
Memory struct {
@@ -57,9 +67,15 @@ type (
}
)
var instance *Repository
var (
ErrInitialization = errors.New("unable to initialize NVML")
ErrDriverAPI = errors.New("an error occured while querying the driver")
ErrNotFound = errors.New("the gpu is not found")
func (*Repository) Close() error {
c *cache
)
func Close() error {
ret := nvml.Shutdown()
if ret != nvml.SUCCESS {
return errors.New(nvml.ErrorString(ret))
@@ -67,60 +83,129 @@ func (*Repository) Close() error {
return nil
}
func New() *Repository {
if instance == nil {
ret := nvml.Init()
if ret != nvml.SUCCESS {
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
}
driverVersion, ret := nvml.SystemGetDriverVersion()
if ret != nvml.SUCCESS {
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
}
cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
if ret != nvml.SUCCESS {
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
}
instance = &Repository{
driverVersion: driverVersion,
cudaVersion: parseCUDA(cudaVersion),
}
func RunDaemon() {
c = &cache{
gpus: make(map[string]GPU),
}
return instance
ret := nvml.Init()
if ret != nvml.SUCCESS {
log.Println("[ERROR]", nvml.ErrorString(ret))
return
}
go func() {
for {
time.Sleep(1 * time.Second)
update()
}
}()
}
func (*Repository) GetGPUs() ([]GPU, error) {
func update() {
c.mu.Lock()
defer c.mu.Unlock()
count, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret))
log.Println("[ERROR]", nvml.ErrorString(ret))
return
}
gpus := make([]GPU, 0, count)
for i := 0; i < count; i++ {
gpu, _, err := getGPU(i)
gpu, err := query(i)
if err != nil {
return nil, err
log.Println("[ERROR]", err)
return
}
gpus = append(gpus, gpu)
t := &gpu.Temperature
if g, ok := c.gpus[gpu.UUID]; ok {
if g.Temperature.Min > t.Current {
t.Min = t.Current
} else {
t.Min = g.Temperature.Min
}
if g.Temperature.Max < t.Current {
t.Max = t.Current
} else {
t.Max = g.Temperature.Max
}
} else {
t.Max = t.Current
t.Min = t.Current
}
c.gpus[gpu.UUID] = gpu
}
return gpus, nil
}
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
gpu, device, err := getGPU(ID)
if err != nil {
return GPUDetail{}, err
// GPUByUUID looks up the cached GPU entry stored under uuid.
//
// The lookup is performed while holding c.mu's read lock. When the cache has
// no entry for uuid, the zero GPU is returned together with an error that
// wraps ErrNotFound and includes the requested uuid.
func GPUByUUID(uuid string) (GPU, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	if cached, found := c.gpus[uuid]; found {
		return cached, nil
	}
	return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
}
// Summary returns the UUID and name of every GPU currently held in the cache.
//
// The cache is read while holding c.mu's read lock. The result slice is
// allocated up front with room for every cached GPU, so it is never nil, even
// when the cache is empty (a nil slice would be encoded as JSON null instead of
// an empty array). Entries follow Go's map iteration order, which is
// unspecified.
func Summary() []GPUSummary {
	c.mu.RLock()
	defer c.mu.RUnlock()

	res := make([]GPUSummary, 0, len(c.gpus))
	for _, gpu := range c.gpus {
		res = append(res, GPUSummary{
			UUID: gpu.UUID,
			Name: gpu.Name,
		})
	}
	return res
}
// DriverVersion asks NVML for the driver version string.
//
// If NVML answers with anything other than nvml.SUCCESS, an empty string is
// returned together with an error that wraps ErrInitialization and carries
// NVML's description of the failure.
func DriverVersion() (string, error) {
	version, status := nvml.SystemGetDriverVersion()
	if status != nvml.SUCCESS {
		reason := nvml.ErrorString(status)
		return "", fmt.Errorf("%w: %s", ErrInitialization, reason)
	}
	return version, nil
}
// CUDAVersion asks NVML for the CUDA driver version and formats the integer it
// gets back with parseCUDA.
//
// If NVML answers with anything other than nvml.SUCCESS, an empty string is
// returned together with an error that wraps ErrInitialization and carries
// NVML's description of the failure.
func CUDAVersion() (string, error) {
	raw, status := nvml.SystemGetCudaDriverVersion_v2()
	if status != nvml.SUCCESS {
		reason := nvml.ErrorString(status)
		return "", fmt.Errorf("%w: %s", ErrInitialization, reason)
	}
	return parseCUDA(raw), nil
}
// parseCUDA renders an integer CUDA version as "major.minor".
//
// The major part is version/1000 and the minor part is the remainder of that
// division, divided by 10; for example 12040 becomes "12.4".
func parseCUDA(version int) string {
	major := version / 1000
	minor := (version % 1000) / 10
	return fmt.Sprintf("%d.%d", major, minor)
}
func query(index int) (GPU, error) {
device, ret := nvml.DeviceGetHandleByIndex(index)
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
name, ret := device.GetName()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
uuid, ret := device.GetUUID()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fanCount, ret := device.GetNumFans()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fans := make([]Fan, 0, fanCount)
for i := 0; i < fanCount; i++ {
fdev, ret := device.GetFanSpeed_v2(i)
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fan := Fan{
Speed: int(fdev),
@@ -130,30 +215,30 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
load, ret := device.GetUtilizationRates()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
decUsage, _, ret := device.GetDecoderUtilization()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
encUsage, _, ret := device.GetEncoderUtilization()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
// Fetch all running process on the GPU
var allProcess []Process
allProcess := make([]Process, 0)
// Compute proc
proc, ret := device.GetComputeRunningProcesses()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
@@ -169,7 +254,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
// Graphics/3D procs
proc, ret = device.GetGraphicsRunningProcesses()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
@@ -184,7 +269,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
// MPS procs
proc, ret = device.GetMPSComputeRunningProcesses()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
@@ -206,12 +291,15 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
mem, ret := device.GetMemoryInfo_v2()
if ret != nvml.SUCCESS {
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
return GPUDetail{
GPU: gpu,
CoreTemperature: int(temp),
return GPU{
Name: name,
UUID: uuid,
Temperature: Temperature{
Current: uint(temp),
},
Utilization: Utilization{
Decoder: int(decUsage),
Encoder: int(encUsage),
@@ -228,43 +316,3 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
Fans: fans,
}, nil
}
// DriverVersion returns the value of the Repository's driverVersion field.
func (r *Repository) DriverVersion() string {
return r.driverVersion
}
// CUDAVersion returns the value of the Repository's cudaVersion field.
func (r *Repository) CUDAVersion() string {
return r.cudaVersion
}
// parseCUDA renders an integer CUDA version as "major.minor".
//
// The major part is version/1000 and the minor part is the remainder of that
// division, divided by 10; for example 12040 becomes "12.4".
func parseCUDA(version int) string {
	major := version / 1000
	minor := (version % 1000) / 10
	return fmt.Sprintf("%d.%d", major, minor)
}
// getGPU fetches the NVML device handle at index ID and reads its name and
// UUID.
//
// On success it returns a GPU populated with Name, UUID and Index (set to ID)
// along with the device handle. If any of the three NVML calls returns
// something other than nvml.SUCCESS, it returns zero values and an error built
// from NVML's error string for that return code.
func getGPU(ID int) (GPU, nvml.Device, error) {
	handle, status := nvml.DeviceGetHandleByIndex(ID)
	if status != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(status))
	}

	gpuName, status := handle.GetName()
	if status != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(status))
	}

	gpuUUID, status := handle.GetUUID()
	if status != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(status))
	}

	return GPU{Name: gpuName, UUID: gpuUUID, Index: ID}, handle, nil
}