// nvidia-web-dashboard/pkg/nvidia/nvidia.go

package nvidia

import (
	"errors"
	"fmt"
	"os"
	"strconv"
	"strings"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// Repository is the access point for GPU information obtained through NVML.
// It keeps the driver and CUDA version strings alongside it.
type Repository struct {
	driverVersion string
	cudaVersion   string
}

// GPU identifies a single device by name and UUID. Index is the device index
// the entry was looked up with; it is excluded from the JSON encoding.
type GPU struct {
	Name  string `json:"name"`
	UUID  string `json:"uuid"`
	Index int    `json:"-"`
}

// GPUDetail is a GPU together with a snapshot of its temperature,
// utilization, running processes, memory figures and fans. The embedded GPU
// fields are promoted, and Utilization is encoded under the JSON key "usage".
type GPUDetail struct {
	GPU
	CoreTemperature int         `json:"coreTemperature"`
	Utilization     Utilization `json:"usage"`
	Processes       []Process   `json:"processes"`
	Memory          Memory      `json:"memory"`
	Fans            []Fan       `json:"fans"`
}

// Memory holds the free, reserved, total and used memory figures of a device.
// Version is carried along but excluded from the JSON encoding.
type Memory struct {
	Free     int `json:"free"`
	Reserved int `json:"reserved"`
	Total    int `json:"total"`
	Used     int `json:"used"`
	Version  int `json:"-"`
}

// Process describes one process running on a device. Name is encoded under
// the JSON key "commandLine".
type Process struct {
	PID        int    `json:"pid"`
	MemoryUsed int    `json:"memoryUsed"`
	Name       string `json:"commandLine"`
	Type       string `json:"type"`
}

// Utilization groups the decoder, encoder and compute utilization figures of
// a device.
type Utilization struct {
	Decoder int `json:"decoder"`
	Encoder int `json:"encoder"`
	Compute int `json:"compute"`
}

// Fan holds the speed reading of a single fan.
type Fan struct {
	Speed int `json:"speed"`
}

// instance is the package-wide Repository handed out by New. It is nil until
// New has succeeded once, and nil again after a successful Close.
//
// Access is not synchronized; New and Close are not safe for concurrent use.
var instance *Repository

// Close shuts NVML down. On success the cached Repository is dropped so that
// a later call to New initializes NVML again instead of handing back a
// Repository bound to a library that is no longer initialized.
//
// If the shutdown fails, the NVML error string is returned as the error and
// the cached Repository is left in place.
func (*Repository) Close() error {
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	instance = nil
	return nil
}

// New returns the shared Repository, creating it on first use. Creation
// initializes NVML and reads the driver version and the CUDA driver version,
// which are kept on the Repository.
//
// New panics if NVML cannot be initialized or if either version query fails;
// the panic message names the step that failed.
func New() *Repository {
	if instance != nil {
		return instance
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		panic("unable to initialize NVML: " + nvml.ErrorString(ret))
	}

	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		panic("unable to get driver version: " + nvml.ErrorString(ret))
	}

	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		panic("unable to get CUDA driver version: " + nvml.ErrorString(ret))
	}

	instance = &Repository{
		driverVersion: driverVersion,
		cudaVersion:   parseCUDA(cudaVersion),
	}
	return instance
}

// GetGPUs returns one GPU entry per device reported by NVML, in index order.
//
// It fails if the device count cannot be read or if any single device cannot
// be resolved; in both cases the returned slice is nil.
func (*Repository) GetGPUs() ([]GPU, error) {
	total, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret))
	}

	// Allocate the final length up front and fill each slot by index.
	list := make([]GPU, total)
	for idx := range list {
		entry, _, err := getGPU(idx)
		if err != nil {
			return nil, err
		}
		list[idx] = entry
	}
	return list, nil
}

// GetGPU collects a detailed snapshot of the GPU at index ID: fan speeds,
// core temperature, utilization, running processes and memory figures.
//
// Any failing NVML query aborts the call and is returned as an error whose
// message names the query that failed. The one exception is the fan count:
// ERROR_NOT_SUPPORTED there is treated as "this device has no fans" rather
// than as a failure.
//
// Processes and Fans in the returned value are never nil, so both encode as
// JSON arrays even when they are empty.
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
	gpu, device, err := getGPU(ID)
	if err != nil {
		return GPUDetail{}, err
	}

	fans, err := fanSpeeds(device)
	if err != nil {
		return GPUDetail{}, err
	}

	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New("unable to get temperature: " + nvml.ErrorString(ret))
	}

	load, ret := device.GetUtilizationRates()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New("unable to get utilization rates: " + nvml.ErrorString(ret))
	}

	// Only the utilization value is used; the second result is discarded.
	decUsage, _, ret := device.GetDecoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New("unable to get decoder utilization: " + nvml.ErrorString(ret))
	}
	encUsage, _, ret := device.GetEncoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New("unable to get encoder utilization: " + nvml.ErrorString(ret))
	}

	processes, err := runningProcesses(device)
	if err != nil {
		return GPUDetail{}, err
	}

	mem, ret := device.GetMemoryInfo_v2()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New("unable to get memory info: " + nvml.ErrorString(ret))
	}

	return GPUDetail{
		GPU:             gpu,
		CoreTemperature: int(temp),
		Utilization: Utilization{
			Decoder: int(decUsage),
			Encoder: int(encUsage),
			Compute: int(load.Gpu),
		},
		Processes: processes,
		Memory: Memory{
			Free:     int(mem.Free),
			Reserved: int(mem.Reserved),
			Total:    int(mem.Total),
			Used:     int(mem.Used),
			Version:  int(mem.Version),
		},
		Fans: fans,
	}, nil
}

// fanSpeeds reads the speed of every fan on device. A device that reports
// ERROR_NOT_SUPPORTED for its fan count yields an empty, non-nil slice.
func fanSpeeds(device nvml.Device) ([]Fan, error) {
	fanCount, ret := device.GetNumFans()
	switch ret {
	case nvml.SUCCESS:
	case nvml.ERROR_NOT_SUPPORTED:
		return []Fan{}, nil
	default:
		return nil, errors.New("unable to get fan count: " + nvml.ErrorString(ret))
	}

	fans := make([]Fan, 0, fanCount)
	for i := 0; i < fanCount; i++ {
		speed, ret := device.GetFanSpeed_v2(i)
		if ret != nvml.SUCCESS {
			return nil, errors.New("unable to get speed of fan " + strconv.Itoa(i) + ": " + nvml.ErrorString(ret))
		}
		fans = append(fans, Fan{Speed: int(speed)})
	}
	return fans, nil
}

// runningProcesses lists the compute, graphics and MPS compute processes
// running on device, in that order, each tagged with its kind and its command
// line. The result is never nil.
func runningProcesses(device nvml.Device) ([]Process, error) {
	sources := []struct {
		kind string
		list func() ([]nvml.ProcessInfo, nvml.Return)
	}{
		{"Compute", device.GetComputeRunningProcesses},
		{"Graphics", device.GetGraphicsRunningProcesses},
		{"MPSCompute", device.GetMPSComputeRunningProcesses},
	}

	// Start from an empty slice rather than nil so that a GPU with no
	// processes encodes as [] instead of null.
	processes := []Process{}
	for _, src := range sources {
		infos, ret := src.list()
		if ret != nvml.SUCCESS {
			return nil, errors.New("unable to get " + src.kind + " processes: " + nvml.ErrorString(ret))
		}
		for _, info := range infos {
			pid := int(info.Pid)
			processes = append(processes, Process{
				PID:        pid,
				MemoryUsed: int(info.UsedGpuMemory),
				Type:       src.kind,
				Name:       processCommandLine(pid),
			})
		}
	}
	return processes, nil
}

// processCommandLine returns the command line of pid as read from
// /proc/<pid>/cmdline, with arguments separated by single spaces. It returns
// "n/a" when the file cannot be read or holds no command line.
func processCommandLine(pid int) string {
	raw, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/cmdline")
	if err != nil {
		// Best effort: the process may have exited or belong to another
		// namespace, and the name is purely informational.
		return "n/a"
	}

	// Arguments are NUL-separated and the last one is NUL-terminated; drop
	// the terminator so the result has no trailing space.
	cmdline := strings.TrimRight(string(raw), "\x00")
	if cmdline == "" {
		return "n/a"
	}
	return strings.ReplaceAll(cmdline, "\x00", " ")
}

// DriverVersion returns the driver version string stored on the Repository.
func (r *Repository) DriverVersion() string {
	return r.driverVersion
}

// CUDAVersion returns the CUDA version string stored on the Repository.
func (r *Repository) CUDAVersion() string {
	return r.cudaVersion
}

// parseCUDA renders an integer CUDA version as "major.minor". The major part
// is version divided by 1000; the minor part is the remainder divided by 10,
// so 12020 becomes "12.2".
func parseCUDA(version int) string {
	major, rest := version/1000, version%1000
	return fmt.Sprintf("%d.%d", major, rest/10)
}

// getGPU resolves the NVML device handle at index ID and reads the device's
// name and UUID.
//
// It returns the populated GPU together with the handle so callers can issue
// further queries against the same device. On failure it returns zero values
// and an error naming the failed query and the device index.
func getGPU(ID int) (GPU, nvml.Device, error) {
	// Zero handle returned alongside every error.
	var none nvml.Device

	device, ret := nvml.DeviceGetHandleByIndex(ID)
	if ret != nvml.SUCCESS {
		return GPU{}, none, fmt.Errorf("unable to get handle of GPU %d: %s", ID, nvml.ErrorString(ret))
	}

	name, ret := device.GetName()
	if ret != nvml.SUCCESS {
		return GPU{}, none, fmt.Errorf("unable to get name of GPU %d: %s", ID, nvml.ErrorString(ret))
	}

	uuid, ret := device.GetUUID()
	if ret != nvml.SUCCESS {
		return GPU{}, none, fmt.Errorf("unable to get UUID of GPU %d: %s", ID, nvml.ErrorString(ret))
	}

	return GPU{Name: name, UUID: uuid, Index: ID}, device, nil
}