Start project

2024-02-23 20:24:27 +01:00
commit de91d6f3e8
7 changed files with 533 additions and 0 deletions
--- a/pkg/nvidia/nvidia.go
+++ b/pkg/nvidia/nvidia.go
@@ -0,0 +1,268 @@
+package nvidia
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"strconv"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// Repository is the entry point for querying NVIDIA GPUs through NVML.
+// It carries the driver and CUDA versions captured when it was created.
+type Repository struct {
+	driverVersion string
+	cudaVersion   string
+}
+
+// GPU identifies one device by its NVML name, UUID and index.
+type GPU struct {
+	Name  string
+	UUID  string
+	Index int
+}
+
+// GPUDetail is a point-in-time snapshot of a single GPU: its identity plus
+// temperature, utilization, running processes, memory and fan readings.
+type GPUDetail struct {
+	GPU
+	CoreTemperature int
+	Utilization     Utilization
+	Processes       []Process
+	Memory          Memory
+	Fans            []Fan
+}
+
+// Memory mirrors the counters of NVML's v2 memory info structure.
+// NOTE(review): values are presumably expressed in bytes; confirm against
+// the NVML documentation.
+type Memory struct {
+	Free     int
+	Reserved int
+	Total    int
+	Used     int
+	Version  int
+}
+
+// Process describes one process that NVML reports as running on a GPU.
+// Type is the NVML process list it came from ("Compute", "Graphics" or
+// "MPSCompute").
+type Process struct {
+	PID        int
+	MemoryUsed int
+	Name       string
+	Type       string
+}
+
+// Utilization groups the decoder, encoder and overall GPU utilization
+// readings of a device.
+type Utilization struct {
+	Decode int
+	Encode int
+	Rate   int
+}
+
+// Fan holds the speed reading of a single fan.
+// NOTE(review): NVML presumably reports this as a percentage; confirm.
+type Fan struct {
+	Speed int
+}
+
+// instance is the package-wide Repository: New creates it on its first call
+// and hands the same pointer back on every later call.
+var instance *Repository
+
+// Close shuts NVML down and forgets the cached Repository.
+//
+// When the shutdown succeeds, the package-level instance is cleared so that
+// a later call to New initializes NVML again instead of returning a
+// Repository whose library has already been shut down. When the shutdown
+// fails, the instance is left untouched and the NVML error text is returned.
+func (*Repository) Close() error {
+	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
+		return errors.New(nvml.ErrorString(ret))
+	}
+	instance = nil
+	return nil
+}
+
+// New returns the package-wide Repository, creating it on first use.
+//
+// The first call initializes NVML and caches the driver version and the CUDA
+// driver version (formatted by parseCUDA). Later calls return the cached
+// instance without querying NVML again.
+//
+// New panics if NVML cannot be initialized or if either version cannot be
+// read; in the latter case NVML is shut down again before panicking so a
+// recovered caller does not leave the library initialized. Access to the
+// shared instance is not synchronized here.
+func New() *Repository {
+	if instance != nil {
+		return instance
+	}
+
+	if ret := nvml.Init(); ret != nvml.SUCCESS {
+		panic("unable to initialize NVML: " + nvml.ErrorString(ret))
+	}
+
+	driverVersion, ret := nvml.SystemGetDriverVersion()
+	if ret != nvml.SUCCESS {
+		// Best-effort cleanup: the panic below already reports the real failure.
+		_ = nvml.Shutdown()
+		panic("unable to get driver version: " + nvml.ErrorString(ret))
+	}
+
+	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
+	if ret != nvml.SUCCESS {
+		// Best-effort cleanup: the panic below already reports the real failure.
+		_ = nvml.Shutdown()
+		panic("unable to get CUDA driver version: " + nvml.ErrorString(ret))
+	}
+
+	instance = &Repository{
+		driverVersion: driverVersion,
+		cudaVersion:   parseCUDA(cudaVersion),
+	}
+	return instance
+}
+
+// GetGPUs lists every GPU that NVML reports, ordered by device index.
+//
+// It fails as soon as the device count or any single device cannot be
+// queried; no partial list is returned in that case.
+func (*Repository) GetGPUs() ([]GPU, error) {
+	total, status := nvml.DeviceGetCount()
+	if status != nvml.SUCCESS {
+		return nil, errors.New("unable to get device count: " + nvml.ErrorString(status))
+	}
+
+	list := make([]GPU, total)
+	for idx := range list {
+		info, _, err := getGPU(idx)
+		if err != nil {
+			return nil, err
+		}
+		list[idx] = info
+	}
+	return list, nil
+}
+
+// GetGPU returns a detailed snapshot of the GPU at index ID.
+//
+// The snapshot contains the device identity, the speed of each fan, the core
+// temperature, GPU/decoder/encoder utilization, the compute, graphics and MPS
+// compute processes running on the device, and the memory counters.
+//
+// Each process name is the process command line read from
+// /proc/<pid>/cmdline; it stays "n/a" when that file cannot be read or is
+// empty. Any failing NVML query aborts the call and returns an error that
+// names the query which failed.
+func (*Repository) GetGPU(ID int) (GPUDetail, error) {
+	gpu, device, err := getGPU(ID)
+	if err != nil {
+		return GPUDetail{}, err
+	}
+
+	fanCount, ret := device.GetNumFans()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get fan count: " + nvml.ErrorString(ret))
+	}
+
+	fans := make([]Fan, 0, fanCount)
+	for i := 0; i < fanCount; i++ {
+		speed, ret := device.GetFanSpeed_v2(i)
+		if ret != nvml.SUCCESS {
+			return GPUDetail{}, errors.New("unable to get fan speed: " + nvml.ErrorString(ret))
+		}
+		fans = append(fans, Fan{Speed: int(speed)})
+	}
+
+	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get temperature: " + nvml.ErrorString(ret))
+	}
+
+	load, ret := device.GetUtilizationRates()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get utilization rates: " + nvml.ErrorString(ret))
+	}
+
+	decUsage, _, ret := device.GetDecoderUtilization()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get decoder utilization: " + nvml.ErrorString(ret))
+	}
+	encUsage, _, ret := device.GetEncoderUtilization()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get encoder utilization: " + nvml.ErrorString(ret))
+	}
+
+	// Collect every process running on the GPU, one NVML list at a time.
+	var allProcess []Process
+
+	// Compute processes.
+	proc, ret := device.GetComputeRunningProcesses()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get compute processes: " + nvml.ErrorString(ret))
+	}
+	for _, p := range proc {
+		allProcess = append(allProcess, newProcess(int(p.Pid), int(p.UsedGpuMemory), "Compute"))
+	}
+
+	// Graphics/3D processes.
+	proc, ret = device.GetGraphicsRunningProcesses()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get graphics processes: " + nvml.ErrorString(ret))
+	}
+	for _, p := range proc {
+		allProcess = append(allProcess, newProcess(int(p.Pid), int(p.UsedGpuMemory), "Graphics"))
+	}
+
+	// MPS compute processes.
+	proc, ret = device.GetMPSComputeRunningProcesses()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get MPS compute processes: " + nvml.ErrorString(ret))
+	}
+	for _, p := range proc {
+		allProcess = append(allProcess, newProcess(int(p.Pid), int(p.UsedGpuMemory), "MPSCompute"))
+	}
+
+	mem, ret := device.GetMemoryInfo_v2()
+	if ret != nvml.SUCCESS {
+		return GPUDetail{}, errors.New("unable to get memory info: " + nvml.ErrorString(ret))
+	}
+
+	return GPUDetail{
+		GPU:             gpu,
+		CoreTemperature: int(temp),
+		Utilization: Utilization{
+			Decode: int(decUsage),
+			Encode: int(encUsage),
+			Rate:   int(load.Gpu),
+		},
+		Processes: allProcess,
+		Memory: Memory{
+			Free:     int(mem.Free),
+			Reserved: int(mem.Reserved),
+			Total:    int(mem.Total),
+			Used:     int(mem.Used),
+			Version:  int(mem.Version),
+		},
+		Fans: fans,
+	}, nil
+}
+
+// newProcess builds the Process entry for pid, tagging it with the NVML
+// process list it came from (kind) and resolving its name right away so the
+// stored entry, not a loop copy, carries the name.
+func newProcess(pid, memoryUsed int, kind string) Process {
+	return Process{
+		PID:        pid,
+		MemoryUsed: memoryUsed,
+		Name:       processName(pid),
+		Type:       kind,
+	}
+}
+
+// processName returns the command line of pid as a single space-separated
+// string, or "n/a" when /proc/<pid>/cmdline cannot be read or is empty.
+func processName(pid int) string {
+	cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/cmdline")
+	if err != nil {
+		// Best effort: the process may have exited or /proc may be unavailable.
+		return "n/a"
+	}
+
+	// Arguments in cmdline are NUL-separated and NUL-terminated: drop the
+	// trailing terminators, then turn the remaining separators into spaces.
+	end := len(cmdline)
+	for end > 0 && cmdline[end-1] == 0 {
+		end--
+	}
+	cmdline = cmdline[:end]
+	if len(cmdline) == 0 {
+		return "n/a"
+	}
+	for i, b := range cmdline {
+		if b == 0 {
+			cmdline[i] = ' '
+		}
+	}
+	return string(cmdline)
+}
+
+// DriverVersion returns the driver version string stored in the Repository,
+// i.e. the value read from NVML when the Repository was created by New.
+func (r *Repository) DriverVersion() string {
+	return r.driverVersion
+}
+
+// CUDAVersion returns the CUDA driver version stored in the Repository,
+// formatted as "major.minor" by parseCUDA when the Repository was created.
+func (r *Repository) CUDAVersion() string {
+	return r.cudaVersion
+}
+
+// parseCUDA renders an integer CUDA version as "major.minor".
+//
+// The thousands of version form the major number and the tens of what is
+// left form the minor number, so 12020 becomes "12.2".
+func parseCUDA(version int) string {
+	major := version / 1000
+	minor := version % 1000 / 10
+	return strconv.Itoa(major) + "." + strconv.Itoa(minor)
+}
+
+func getGPU(ID int) (GPU, nvml.Device, error) {
+	device, ret := nvml.DeviceGetHandleByIndex(ID)
+	if ret != nvml.SUCCESS {
+		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
+	}
+
+	name, ret := device.GetName()
+	if ret != nvml.SUCCESS {
+		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
+	}
+
+	uuid, ret := device.GetUUID()
+	if ret != nvml.SUCCESS {
+		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
+	}
+
+	gpu := GPU{
+		Name:  name,
+		UUID:  uuid,
+		Index: ID,
+	}
+
+	return gpu, device, nil
+}