// nvidia-web-dashboard/pkg/nvidia/nvidia.go

package nvidia

import (
	"errors"
	"fmt"
	"os"
	"strconv"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

// Repository is the entry point for querying GPUs. It carries the driver and
// CUDA version strings that New reads once when the Repository is created.
type Repository struct {
	driverVersion string // driver version string cached by New
	cudaVersion   string // CUDA version rendered by parseCUDA, cached by New
}

// GPU is the short description of a device, as filled in by getGPU.
type GPU struct {
	Name  string // product name returned by device.GetName
	UUID  string // identifier returned by device.GetUUID
	Index int    // NVML device index the GPU was looked up with
}

// GPUDetail is the full snapshot built by GetGPU. It embeds GPU, so the
// identity fields (Name, UUID, Index) are promoted onto it.
type GPUDetail struct {
	GPU
	CoreTemperature int         // reading of the nvml.TEMPERATURE_GPU sensor
	Utilization     Utilization // GPU, encoder and decoder load
	Processes       []Process   // compute, graphics and MPS processes on the device
	Memory          Memory      // memory counters from GetMemoryInfo_v2
	Fans            []Fan       // one entry per fan reported by GetNumFans
}

// Memory mirrors the fields of the structure returned by GetMemoryInfo_v2,
// converted to int.
// NOTE(review): the values are presumably byte counts as reported by NVML —
// confirm against the NVML documentation.
type Memory struct {
	Free     int
	Reserved int
	Total    int
	Used     int
	Version  int
}

// Process describes one process that NVML reports as running on a GPU.
type Process struct {
	PID        int    // process ID reported by NVML
	MemoryUsed int    // GPU memory attributed to the process by NVML
	Name       string // label for the process; GetGPU defaults it to "n/a"
	Type       string // "Compute", "Graphics" or "MPSCompute"
}

// Utilization groups the load figures GetGPU collects for a device.
// NOTE(review): the values are presumably percentages — confirm against the
// NVML documentation.
type Utilization struct {
	Decode int // from device.GetDecoderUtilization
	Encode int // from device.GetEncoderUtilization
	Rate   int // Gpu field of device.GetUtilizationRates
}

// Fan holds the speed of a single fan as returned by GetFanSpeed_v2.
// NOTE(review): presumably a percentage of maximum speed — confirm.
type Fan struct {
	Speed int
}

// instance is the package-level Repository that New creates on its first call
// and hands back on later calls.
var instance *Repository

// Close shuts NVML down and forgets the shared instance, so that a later call
// to New initializes the library again instead of handing back a Repository
// backed by a shut-down NVML.
//
// It returns an error carrying the NVML error string when the shutdown fails;
// in that case the shared instance is left in place.
func (*Repository) Close() error {
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		return errors.New("unable to shut down NVML: " + nvml.ErrorString(ret))
	}
	// Without this reset New would keep returning the stale instance.
	instance = nil
	return nil
}

// New returns the shared Repository. The first call initializes NVML and
// caches the driver version and the formatted CUDA version; later calls return
// the same instance without querying NVML again.
//
// New panics when NVML cannot be initialized or when either version query
// fails; the panic message names the step that failed.
//
// The lazy initialization is not synchronized.
// NOTE(review): callers presumably invoke New from a single goroutine before
// serving requests — confirm.
func New() *Repository {
	if instance != nil {
		return instance
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		panic("unable to initialize NVML: " + nvml.ErrorString(ret))
	}

	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		panic("unable to get driver version: " + nvml.ErrorString(ret))
	}

	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		panic("unable to get CUDA driver version: " + nvml.ErrorString(ret))
	}

	instance = &Repository{
		driverVersion: driverVersion,
		cudaVersion:   parseCUDA(cudaVersion),
	}
	return instance
}

// GetGPUs lists every device NVML reports, in index order, as short summaries
// (name, UUID and index).
//
// It returns an error when the device count cannot be read or when the lookup
// of any single device fails; no partial list is returned in either case.
func (*Repository) GetGPUs() ([]GPU, error) {
	total, status := nvml.DeviceGetCount()
	if status != nvml.SUCCESS {
		return nil, errors.New("unable to get device count: " + nvml.ErrorString(status))
	}

	summaries := make([]GPU, total)
	for idx := range summaries {
		summary, _, err := getGPU(idx)
		if err != nil {
			return nil, err
		}
		summaries[idx] = summary
	}
	return summaries, nil
}

// GetGPU collects a detailed snapshot of the device at NVML index ID: its
// identity, fan speeds, core temperature, utilization, memory counters and the
// compute, graphics and MPS processes currently running on it.
//
// Every NVML query has to succeed. The first failure aborts the call and is
// returned as an error naming the query that failed, together with a zero
// GPUDetail.
//
// Process names are resolved on a best-effort basis from /proc/<pid>/cmdline
// and stay "n/a" when the command line cannot be read or is empty.
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
	gpu, device, err := getGPU(ID)
	if err != nil {
		return GPUDetail{}, err
	}

	// fail builds the error result for a failed NVML query.
	fail := func(what string, ret nvml.Return) (GPUDetail, error) {
		return GPUDetail{}, errors.New("unable to get " + what + ": " + nvml.ErrorString(ret))
	}

	fanCount, ret := device.GetNumFans()
	if ret != nvml.SUCCESS {
		return fail("fan count", ret)
	}

	fans := make([]Fan, 0, fanCount)
	for i := 0; i < fanCount; i++ {
		speed, ret := device.GetFanSpeed_v2(i)
		if ret != nvml.SUCCESS {
			return fail("speed of fan "+strconv.Itoa(i), ret)
		}
		fans = append(fans, Fan{Speed: int(speed)})
	}

	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		return fail("temperature", ret)
	}

	load, ret := device.GetUtilizationRates()
	if ret != nvml.SUCCESS {
		return fail("utilization rates", ret)
	}

	decUsage, _, ret := device.GetDecoderUtilization()
	if ret != nvml.SUCCESS {
		return fail("decoder utilization", ret)
	}
	encUsage, _, ret := device.GetEncoderUtilization()
	if ret != nvml.SUCCESS {
		return fail("encoder utilization", ret)
	}

	// Gather every process running on the GPU, tagged with its kind.
	var allProcess []Process
	addProcess := func(pid, memoryUsed int, kind string) {
		allProcess = append(allProcess, Process{
			PID:        pid,
			MemoryUsed: memoryUsed,
			Type:       kind,
			Name:       "n/a",
		})
	}

	// Compute processes.
	proc, ret := device.GetComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return fail("compute processes", ret)
	}
	for _, p := range proc {
		addProcess(int(p.Pid), int(p.UsedGpuMemory), "Compute")
	}

	// Graphics/3D processes.
	proc, ret = device.GetGraphicsRunningProcesses()
	if ret != nvml.SUCCESS {
		return fail("graphics processes", ret)
	}
	for _, p := range proc {
		addProcess(int(p.Pid), int(p.UsedGpuMemory), "Graphics")
	}

	// MPS processes.
	proc, ret = device.GetMPSComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return fail("MPS compute processes", ret)
	}
	for _, p := range proc {
		addProcess(int(p.Pid), int(p.UsedGpuMemory), "MPSCompute")
	}

	// Resolve each process's command line. The slice is indexed rather than
	// ranged by value so the name lands on the stored element, not on a copy.
	for i := range allProcess {
		if name, ok := readCmdline(allProcess[i].PID); ok {
			allProcess[i].Name = name
		}
	}

	mem, ret := device.GetMemoryInfo_v2()
	if ret != nvml.SUCCESS {
		return fail("memory info", ret)
	}

	return GPUDetail{
		GPU:             gpu,
		CoreTemperature: int(temp),
		Utilization: Utilization{
			Decode: int(decUsage),
			Encode: int(encUsage),
			Rate:   int(load.Gpu),
		},
		Processes: allProcess,
		Memory: Memory{
			Free:     int(mem.Free),
			Reserved: int(mem.Reserved),
			Total:    int(mem.Total),
			Used:     int(mem.Used),
			Version:  int(mem.Version),
		},
		Fans: fans,
	}, nil
}

// readCmdline returns the command line of the process pid as read from
// /proc/<pid>/cmdline. The file separates arguments with NUL bytes; trailing
// NULs are dropped and the remaining ones are replaced with spaces so the
// result is printable. ok is false when the file cannot be read or is empty.
func readCmdline(pid int) (cmdline string, ok bool) {
	raw, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/cmdline")
	if err != nil {
		return "", false
	}

	end := len(raw)
	for end > 0 && raw[end-1] == 0 {
		end--
	}
	raw = raw[:end]
	if len(raw) == 0 {
		return "", false
	}

	for i, b := range raw {
		if b == 0 {
			raw[i] = ' '
		}
	}
	return string(raw), true
}

// DriverVersion returns the driver version string stored in the Repository.
func (r *Repository) DriverVersion() string {
	return r.driverVersion
}

// CUDAVersion returns the CUDA version string stored in the Repository.
func (r *Repository) CUDAVersion() string {
	return r.cudaVersion
}

// parseCUDA renders an integer CUDA version as "major.minor". The input is
// read as major*1000 + minor*10, so 12020 becomes "12.2"; the last decimal
// digit is discarded.
func parseCUDA(version int) string {
	const (
		majorUnit = 1000
		minorUnit = 10
	)

	major, remainder := version/majorUnit, version%majorUnit
	return fmt.Sprintf("%d.%d", major, remainder/minorUnit)
}

// getGPU looks up the device at NVML index ID and returns its summary (name,
// UUID and index) together with the NVML handle for further queries.
//
// On failure it returns zero values and an error that names the failed query
// and the device index, so callers that iterate over devices can tell which
// one was at fault.
func getGPU(ID int) (GPU, nvml.Device, error) {
	device, ret := nvml.DeviceGetHandleByIndex(ID)
	if ret != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, fmt.Errorf("unable to get handle of device %d: %s", ID, nvml.ErrorString(ret))
	}

	name, ret := device.GetName()
	if ret != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, fmt.Errorf("unable to get name of device %d: %s", ID, nvml.ErrorString(ret))
	}

	uuid, ret := device.GetUUID()
	if ret != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, fmt.Errorf("unable to get UUID of device %d: %s", ID, nvml.ErrorString(ret))
	}

	return GPU{Name: name, UUID: uuid, Index: ID}, device, nil
}