Files

319 lines
7.1 KiB
Go

package nvidia
import (
"errors"
"fmt"
"log"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
type (
	// cache is the process-wide store of the most recent GPU snapshots,
	// keyed by GPU UUID and guarded by mu for concurrent access.
	cache struct {
		gpus map[string]GPU
		mu   sync.RWMutex
	}

	// GPUSummary is a minimal identification record used when listing GPUs.
	GPUSummary struct {
		UUID string
		Name string
	}

	// GPU is a full point-in-time snapshot of one device, as cached by
	// the polling daemon.
	GPU struct {
		Name        string      `json:"name"`
		UUID        string      `json:"uuid"`
		Index       int         `json:"-"`
		Temperature Temperature `json:"temperature"`
		Utilization Utilization `json:"usage"`
		Processes   []Process   `json:"processes"`
		Memory      Memory      `json:"memory"`
		Fans        []Fan       `json:"fans"`
	}

	// Temperature holds the current core reading plus the running
	// minimum/maximum observed across polls (maintained by update).
	// NOTE(review): values come from nvml.TEMPERATURE_GPU and are
	// presumably degrees Celsius — confirm against NVML docs.
	Temperature struct {
		Min     uint
		Max     uint
		Current uint
	}

	// Memory reports device memory usage as returned by NVML's memory
	// info query. NOTE(review): presumably byte counts — confirm.
	Memory struct {
		Free     int `json:"free"`
		Reserved int `json:"reserved"`
		Total    int `json:"total"`
		Used     int `json:"used"`
		Version  int `json:"-"`
	}

	// Process describes one process currently using the GPU. Name is
	// the command line read from /proc, or "n/a" when unavailable.
	Process struct {
		PID        int    `json:"pid"`
		MemoryUsed int    `json:"memoryUsed"`
		Name       string `json:"commandLine"`
		Type       string `json:"type"`
	}

	// Utilization holds decoder, encoder and compute engine usage.
	// NOTE(review): presumably percentages (NVML utilization rates) —
	// confirm.
	Utilization struct {
		Decoder int `json:"decoder"`
		Encoder int `json:"encoder"`
		Compute int `json:"compute"`
	}

	// Fan reports a single fan's speed as returned by GetFanSpeed_v2.
	Fan struct {
		Speed int `json:"speed"`
	}
)
var (
	// ErrInitialization indicates NVML could not be initialized.
	ErrInitialization = errors.New("unable to initialize NVML")
	// ErrDriverAPI wraps failures of individual NVML driver calls.
	// (Fixed typo: "occured" -> "occurred".)
	ErrDriverAPI = errors.New("an error occurred while querying the driver")
	// ErrNotFound is returned when a requested GPU UUID is not cached.
	ErrNotFound = errors.New("the gpu is not found")

	// c is the package-level snapshot cache, created by RunDaemon.
	c *cache
)
// Close shuts down the NVML library, releasing driver resources.
// It returns an error describing the NVML failure, or nil on success.
func Close() error {
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	return nil
}
// RunDaemon initializes the package cache and NVML, then starts a
// background goroutine that refreshes the GPU snapshots once per
// second. If NVML fails to initialize, the error is logged and no
// polling goroutine is started (the cache stays empty but usable).
//
// NOTE(review): the polling goroutine has no stop mechanism and runs
// for the lifetime of the process; Close only shuts down NVML.
func RunDaemon() {
	c = &cache{
		gpus: make(map[string]GPU),
	}
	ret := nvml.Init()
	if ret != nvml.SUCCESS {
		log.Println("[ERROR]", nvml.ErrorString(ret))
		return
	}
	go func() {
		// Ticker instead of a sleep loop; the first refresh fires one
		// second after startup, matching the original cadence.
		ticker := time.NewTicker(1 * time.Second)
		defer ticker.Stop()
		for range ticker.C {
			update()
		}
	}()
}
// update refreshes the cached snapshot of every GPU the driver
// reports, carrying each GPU's running min/max temperature forward
// across polls. It holds the cache write lock for the duration.
func update() {
	c.mu.Lock()
	defer c.mu.Unlock()
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Println("[ERROR]", nvml.ErrorString(ret))
		return
	}
	for i := 0; i < count; i++ {
		gpu, err := query(i)
		if err != nil {
			// One failing device must not prevent the remaining
			// devices from being refreshed (previously this returned,
			// aborting the whole pass).
			log.Println("[ERROR]", err)
			continue
		}
		t := &gpu.Temperature
		if prev, ok := c.gpus[gpu.UUID]; ok {
			// Extend the running minimum/maximum with the new reading.
			if prev.Temperature.Min > t.Current {
				t.Min = t.Current
			} else {
				t.Min = prev.Temperature.Min
			}
			if prev.Temperature.Max < t.Current {
				t.Max = t.Current
			} else {
				t.Max = prev.Temperature.Max
			}
		} else {
			// First observation of this GPU: seed both bounds.
			t.Min = t.Current
			t.Max = t.Current
		}
		c.gpus[gpu.UUID] = gpu
	}
}
// GPUByUUID returns the cached snapshot of the GPU identified by uuid.
// It returns an error wrapping ErrNotFound when no such GPU is cached.
func GPUByUUID(uuid string) (GPU, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if gpu, ok := c.gpus[uuid]; ok {
		return gpu, nil
	}
	return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
}
// Summary returns the UUID and name of every cached GPU. Map iteration
// order is random, so the order of the result is unspecified. A nil
// slice is returned when no GPUs are cached (preserving JSON "null"
// semantics for existing consumers).
func Summary() []GPUSummary {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if len(c.gpus) == 0 {
		return nil
	}
	// Pre-size: exactly one summary per cached GPU.
	res := make([]GPUSummary, 0, len(c.gpus))
	for _, gpu := range c.gpus {
		res = append(res, GPUSummary{
			UUID: gpu.UUID,
			Name: gpu.Name,
		})
	}
	return res
}
// DriverVersion reports the installed NVIDIA driver version string.
func DriverVersion() (string, error) {
	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		// A failed version query is a driver-API error, not an
		// initialization failure; wrap the sentinel every other NVML
		// query wrapper in this package uses.
		return "", fmt.Errorf("%w: SystemGetDriverVersion: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	return driverVersion, nil
}
// CUDAVersion reports the CUDA driver version as a "major.minor"
// string (e.g. "12.4").
func CUDAVersion() (string, error) {
	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		// A failed version query is a driver-API error, not an
		// initialization failure; use the package's query sentinel.
		return "", fmt.Errorf("%w: SystemGetCudaDriverVersion_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	return parseCUDA(cudaVersion), nil
}
// parseCUDA converts an NVML-encoded CUDA version integer into a
// "major.minor" string, e.g. 12040 -> "12.4" (encoding is
// major*1000 + minor*10).
func parseCUDA(version int) string {
	major := version / 1000
	minor := version % 1000 / 10
	return fmt.Sprintf("%d.%d", major, minor)
}
func query(index int) (GPU, error) {
device, ret := nvml.DeviceGetHandleByIndex(index)
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
name, ret := device.GetName()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
uuid, ret := device.GetUUID()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fanCount, ret := device.GetNumFans()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fans := make([]Fan, 0, fanCount)
for i := 0; i < fanCount; i++ {
fdev, ret := device.GetFanSpeed_v2(i)
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fan := Fan{
Speed: int(fdev),
}
fans = append(fans, fan)
}
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
load, ret := device.GetUtilizationRates()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
decUsage, _, ret := device.GetDecoderUtilization()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
encUsage, _, ret := device.GetEncoderUtilization()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
// Fetch all running process on the GPU
allProcess := make([]Process, 0)
// Compute proc
proc, ret := device.GetComputeRunningProcesses()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
PID: int(p.Pid),
MemoryUsed: int(p.UsedGpuMemory),
Type: "Compute",
Name: "n/a",
}
allProcess = append(allProcess, sproc)
}
// Graphics/3D procs
proc, ret = device.GetGraphicsRunningProcesses()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
PID: int(p.Pid),
MemoryUsed: int(p.UsedGpuMemory),
Type: "Graphics",
Name: "n/a",
}
allProcess = append(allProcess, sproc)
}
// MPS procs
proc, ret = device.GetMPSComputeRunningProcesses()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
PID: int(p.Pid),
MemoryUsed: int(p.UsedGpuMemory),
Type: "MPSCompute",
Name: "n/a",
}
allProcess = append(allProcess, sproc)
}
// Find process command line
for i, process := range allProcess {
if cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(process.PID) + "/cmdline"); err == nil {
process.Name = strings.ReplaceAll(string(cmdline), "\u0000", " ")
allProcess[i] = process
}
}
mem, ret := device.GetMemoryInfo_v2()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
return GPU{
Name: name,
UUID: uuid,
Temperature: Temperature{
Current: uint(temp),
},
Utilization: Utilization{
Decoder: int(decUsage),
Encoder: int(encUsage),
Compute: int(load.Gpu),
},
Processes: allProcess,
Memory: Memory{
Free: int(mem.Free),
Reserved: int(mem.Reserved),
Total: int(mem.Total),
Used: int(mem.Used),
Version: int(mem.Version),
},
Fans: fans,
}, nil
}