add min and max temp, refactor nvidia api

2024-04-21 17:33:21 +00:00
parent 74432b1929
commit d75f4a8431
7 changed files with 192 additions and 179 deletions
--- a/pkg/nvidia/nvidia.go
+++ b/pkg/nvidia/nvidia.go
@@ -3,32 +3,42 @@ package nvidia
 import (
 	"errors"
 	"fmt"
+	"log"
 	"os"
 	"strconv"
 	"strings"
+	"sync"
+	"time"

 	"github.com/NVIDIA/go-nvml/pkg/nvml"
 )

 type (
-	Repository struct {
-		driverVersion string
-		cudaVersion   string
+	cache struct {
+		gpus map[string]GPU
+		mu   sync.RWMutex
+	}
+
+	GPUSummary struct {
+		UUID string
+		Name string
 	}

 	GPU struct {
-		Name  string `json:"name"`
-		UUID  string `json:"uuid"`
-		Index int    `json:"-"`
+		Name        string      `json:"name"`
+		UUID        string      `json:"uuid"`
+		Index       int         `json:"-"`
+		Temperature Temperature `json:"temperature"`
+		Utilization Utilization `json:"usage"`
+		Processes   []Process   `json:"processes"`
+		Memory      Memory      `json:"memory"`
+		Fans        []Fan       `json:"fans"`
 	}

-	GPUDetail struct {
-		GPU
-		CoreTemperature int         `json:"coreTemperature"`
-		Utilization     Utilization `json:"usage"`
-		Processes       []Process   `json:"processes"`
-		Memory          Memory      `json:"memory"`
-		Fans            []Fan       `json:"fans"`
+	Temperature struct {
+		Min     uint
+		Max     uint
+		Current uint
 	}

 	Memory struct {
@@ -57,9 +67,15 @@ type (
 	}
 )

-var instance *Repository
+var (
+	ErrInitialization = errors.New("unable to initialize NVML")
+	ErrDriverAPI      = errors.New("an error occured while querying the driver")
+	ErrNotFound       = errors.New("the gpu is not found")

-func (*Repository) Close() error {
+	c *cache
+)
+
+func Close() error {
 	ret := nvml.Shutdown()
 	if ret != nvml.SUCCESS {
 		return errors.New(nvml.ErrorString(ret))
@@ -67,60 +83,129 @@ func (*Repository) Close() error {
 	return nil
 }

-func New() *Repository {
-	if instance == nil {
-		ret := nvml.Init()
-		if ret != nvml.SUCCESS {
-			panic("unable to initialize NVML: " + nvml.ErrorString(ret))
-		}
-		driverVersion, ret := nvml.SystemGetDriverVersion()
-		if ret != nvml.SUCCESS {
-			panic("unable to initialize NVML: " + nvml.ErrorString(ret))
-		}
-		cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
-		if ret != nvml.SUCCESS {
-			panic("unable to initialize NVML: " + nvml.ErrorString(ret))
-		}
-		instance = &Repository{
-			driverVersion: driverVersion,
-			cudaVersion:   parseCUDA(cudaVersion),
-		}
+func RunDaemon() {
+	c = &cache{
+		gpus: make(map[string]GPU),
 	}
-	return instance
+	ret := nvml.Init()
+	if ret != nvml.SUCCESS {
+		log.Println("[ERROR]", nvml.ErrorString(ret))
+		return
+	}
+	go func() {
+		for {
+			time.Sleep(1 * time.Second)
+			update()
+		}
+	}()
 }

-func (*Repository) GetGPUs() ([]GPU, error) {
+func update() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
 	count, ret := nvml.DeviceGetCount()
 	if ret != nvml.SUCCESS {
-		return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret))
+		log.Println("[ERROR]", nvml.ErrorString(ret))
+		return
 	}
-	gpus := make([]GPU, 0, count)
 	for i := 0; i < count; i++ {
-		gpu, _, err := getGPU(i)
+		gpu, err := query(i)
 		if err != nil {
-			return nil, err
+			log.Println("[ERROR]", err)
+			return
 		}
-		gpus = append(gpus, gpu)
+		t := &gpu.Temperature
+		if g, ok := c.gpus[gpu.UUID]; ok {
+			if g.Temperature.Min > t.Current {
+				t.Min = t.Current
+			} else {
+				t.Min = g.Temperature.Min
+			}
+			if g.Temperature.Max < t.Current {
+				t.Max = t.Current
+			} else {
+				t.Max = g.Temperature.Max
+			}
+		} else {
+			t.Max = t.Current
+			t.Min = t.Current
+		}
+		c.gpus[gpu.UUID] = gpu
 	}
-	return gpus, nil
 }

-func (*Repository) GetGPU(ID int) (GPUDetail, error) {
-	gpu, device, err := getGPU(ID)
-	if err != nil {
-		return GPUDetail{}, err
+func GPUByUUID(uuid string) (GPU, error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	gpu, ok := c.gpus[uuid]
+	if !ok {
+		return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
+	}
+	return gpu, nil
+}
+
+func Summary() []GPUSummary {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	var res []GPUSummary
+	for _, gpu := range c.gpus {
+		res = append(res, GPUSummary{
+			UUID: gpu.UUID,
+			Name: gpu.Name,
+		})
+	}
+	return res
+}
+
+func DriverVersion() (string, error) {
+	driverVersion, ret := nvml.SystemGetDriverVersion()
+	if ret != nvml.SUCCESS {
+		return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
+	}
+	return driverVersion, nil
+}
+
+func CUDAVersion() (string, error) {
+	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
+	if ret != nvml.SUCCESS {
+		return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
+	}
+	return parseCUDA(cudaVersion), nil
+}
+
+func parseCUDA(version int) string {
+	major := (int)(version / 1000)
+	minor := (int)((version - (major * 1000)) / 10)
+
+	return fmt.Sprintf("%d.%d", major, minor)
+}
+
+func query(index int) (GPU, error) {
+	device, ret := nvml.DeviceGetHandleByIndex(index)
+	if ret != nvml.SUCCESS {
+		return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
+	}
+
+	name, ret := device.GetName()
+	if ret != nvml.SUCCESS {
+		return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
+	}
+
+	uuid, ret := device.GetUUID()
+	if ret != nvml.SUCCESS {
+		return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}

 	fanCount, ret := device.GetNumFans()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}

 	fans := make([]Fan, 0, fanCount)
 	for i := 0; i < fanCount; i++ {
 		fdev, ret := device.GetFanSpeed_v2(i)
 		if ret != nvml.SUCCESS {
-			return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+			return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
 		}
 		fan := Fan{
 			Speed: int(fdev),
@@ -130,30 +215,30 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {

 	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}

 	load, ret := device.GetUtilizationRates()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}

 	decUsage, _, ret := device.GetDecoderUtilization()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}
 	encUsage, _, ret := device.GetEncoderUtilization()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}

 	// Fetch all running process on the GPU
-	var allProcess []Process
+	allProcess := make([]Process, 0)

 	// Compute proc
 	proc, ret := device.GetComputeRunningProcesses()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}
 	for _, p := range proc {

@@ -169,7 +254,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
 	// Graphics/3D procs
 	proc, ret = device.GetGraphicsRunningProcesses()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}
 	for _, p := range proc {
 		sproc := Process{
@@ -184,7 +269,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
 	// MPS procs
 	proc, ret = device.GetMPSComputeRunningProcesses()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}
 	for _, p := range proc {
 		sproc := Process{
@@ -206,12 +291,15 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {

 	mem, ret := device.GetMemoryInfo_v2()
 	if ret != nvml.SUCCESS {
-		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
+		return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
 	}

-	return GPUDetail{
-		GPU:             gpu,
-		CoreTemperature: int(temp),
+	return GPU{
+		Name: name,
+		UUID: uuid,
+		Temperature: Temperature{
+			Current: uint(temp),
+		},
 		Utilization: Utilization{
 			Decoder: int(decUsage),
 			Encoder: int(encUsage),
@@ -228,43 +316,3 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
 		Fans: fans,
 	}, nil
 }
-
-func (r *Repository) DriverVersion() string {
-	return r.driverVersion
-}
-
-func (r *Repository) CUDAVersion() string {
-	return r.cudaVersion
-}
-
-func parseCUDA(version int) string {
-	major := (int)(version / 1000)
-	minor := (int)((version - (major * 1000)) / 10)
-
-	return fmt.Sprintf("%d.%d", major, minor)
-}
-
-func getGPU(ID int) (GPU, nvml.Device, error) {
-	device, ret := nvml.DeviceGetHandleByIndex(ID)
-	if ret != nvml.SUCCESS {
-		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
-	}
-
-	name, ret := device.GetName()
-	if ret != nvml.SUCCESS {
-		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
-	}
-
-	uuid, ret := device.GetUUID()
-	if ret != nvml.SUCCESS {
-		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
-	}
-
-	gpu := GPU{
-		Name:  name,
-		UUID:  uuid,
-		Index: ID,
-	}
-
-	return gpu, device, nil
-}