// Package nvidia exposes GPU inventory and telemetry read through NVML.
package nvidia

import (
	"errors"
	"fmt"
	"os"
	"strconv"
	"strings"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

type (
	// Repository is the access point to NVML. It caches the driver and CUDA
	// versions read when it is created by New.
	Repository struct {
		driverVersion string
		cudaVersion   string
	}

	// GPU identifies a single device.
	GPU struct {
		Name  string
		UUID  string
		Index int
	}

	// GPUDetail is a GPU together with a snapshot of its telemetry.
	GPUDetail struct {
		GPU
		CoreTemperature int
		Utilization     Utilization
		Processes       []Process
		Memory          Memory
		Fans            []Fan
	}

	// Memory mirrors the fields of the NVML v2 memory info structure.
	Memory struct {
		Free     int
		Reserved int
		Total    int
		Used     int
		Version  int
	}

	// Process describes one process NVML reports as running on a GPU.
	// Type is "Compute", "Graphics" or "MPSCompute". Name is the process
	// command line, or "n/a" when it could not be determined.
	Process struct {
		PID        int
		MemoryUsed int
		Name       string
		Type       string
	}

	// Utilization groups the decoder, encoder and overall GPU utilization
	// values reported by NVML.
	Utilization struct {
		Decode int
		Encode int
		Rate   int
	}

	// Fan holds the speed NVML reports for one fan.
	Fan struct {
		Speed int
	}
)

// instance is the package-wide Repository lazily created by New.
var instance *Repository

// nvmlErr builds an error of the form "unable to <op>: <NVML error string>".
func nvmlErr(op string, ret nvml.Return) error {
	return errors.New("unable to " + op + ": " + nvml.ErrorString(ret))
}

// Close shuts NVML down. On success the cached instance is discarded so that
// a later call to New initializes NVML again instead of handing out a
// Repository backed by a library that has already been shut down.
func (*Repository) Close() error {
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		return nvmlErr("shut down NVML", ret)
	}
	instance = nil
	return nil
}

// New returns the shared Repository, initializing NVML and reading the driver
// and CUDA versions on first use. It panics if any of those steps fails.
// It is not synchronized; concurrent first calls are not protected.
func New() *Repository {
	if instance != nil {
		return instance
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		panic("unable to initialize NVML: " + nvml.ErrorString(ret))
	}

	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		panic("unable to get driver version: " + nvml.ErrorString(ret))
	}

	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		panic("unable to get CUDA driver version: " + nvml.ErrorString(ret))
	}

	instance = &Repository{
		driverVersion: driverVersion,
		cudaVersion:   parseCUDA(cudaVersion),
	}
	return instance
}

// GetGPUs lists every device NVML reports, in index order. It returns an
// error as soon as any device cannot be queried.
func (*Repository) GetGPUs() ([]GPU, error) {
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return nil, nvmlErr("get device count", ret)
	}

	gpus := make([]GPU, 0, count)
	for i := 0; i < count; i++ {
		gpu, _, err := getGPU(i)
		if err != nil {
			return nil, err
		}
		gpus = append(gpus, gpu)
	}
	return gpus, nil
}

// GetGPU returns identification and a telemetry snapshot (fans, temperature,
// utilization, running processes and memory) for the device at index ID.
// Any failing NVML query aborts the call and is returned as an error.
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
	gpu, device, err := getGPU(ID)
	if err != nil {
		return GPUDetail{}, err
	}

	fanCount, ret := device.GetNumFans()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, nvmlErr("get fan count", ret)
	}
	fans := make([]Fan, 0, fanCount)
	for i := 0; i < fanCount; i++ {
		speed, ret := device.GetFanSpeed_v2(i)
		if ret != nvml.SUCCESS {
			return GPUDetail{}, nvmlErr("get speed of fan "+strconv.Itoa(i), ret)
		}
		fans = append(fans, Fan{Speed: int(speed)})
	}

	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		return GPUDetail{}, nvmlErr("get temperature", ret)
	}

	load, ret := device.GetUtilizationRates()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, nvmlErr("get utilization rates", ret)
	}

	decUsage, _, ret := device.GetDecoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, nvmlErr("get decoder utilization", ret)
	}

	encUsage, _, ret := device.GetEncoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, nvmlErr("get encoder utilization", ret)
	}

	// Gather every process running on the GPU, one NVML query per category,
	// in the order compute, graphics, MPS compute.
	sources := []struct {
		kind string
		list func() ([]nvml.ProcessInfo, nvml.Return)
	}{
		{"Compute", device.GetComputeRunningProcesses},
		{"Graphics", device.GetGraphicsRunningProcesses},
		{"MPSCompute", device.GetMPSComputeRunningProcesses},
	}

	var processes []Process
	for _, src := range sources {
		infos, ret := src.list()
		if ret != nvml.SUCCESS {
			return GPUDetail{}, nvmlErr("get "+src.kind+" processes", ret)
		}
		for _, info := range infos {
			processes = append(processes, Process{
				PID:        int(info.Pid),
				MemoryUsed: int(info.UsedGpuMemory),
				Type:       src.kind,
				Name:       "n/a",
			})
		}
	}

	// Resolve command lines on a best-effort basis. Assign through the index:
	// ranging by value would only update a copy of each element.
	for i := range processes {
		if name, ok := processName(processes[i].PID); ok {
			processes[i].Name = name
		}
	}

	mem, ret := device.GetMemoryInfo_v2()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, nvmlErr("get memory info", ret)
	}

	return GPUDetail{
		GPU:             gpu,
		CoreTemperature: int(temp),
		Utilization: Utilization{
			Decode: int(decUsage),
			Encode: int(encUsage),
			Rate:   int(load.Gpu),
		},
		Processes: processes,
		Memory: Memory{
			Free:     int(mem.Free),
			Reserved: int(mem.Reserved),
			Total:    int(mem.Total),
			Used:     int(mem.Used),
			Version:  int(mem.Version),
		},
		Fans: fans,
	}, nil
}

// DriverVersion returns the driver version string read when the Repository
// was created.
func (r *Repository) DriverVersion() string {
	return r.driverVersion
}

// CUDAVersion returns the CUDA driver version, formatted as "major.minor",
// read when the Repository was created.
func (r *Repository) CUDAVersion() string {
	return r.cudaVersion
}

// parseCUDA formats an integer version of the form major*1000 + minor*10 as
// "major.minor"; for example 12020 becomes "12.2".
func parseCUDA(version int) string {
	major := version / 1000
	minor := (version % 1000) / 10
	return fmt.Sprintf("%d.%d", major, minor)
}

// processName reads /proc/<pid>/cmdline and returns it as a single
// space-separated string. The file separates arguments with NUL bytes, so
// those are turned into spaces and surrounding whitespace is trimmed. ok is
// false when the file cannot be read or the command line is empty.
func processName(pid int) (name string, ok bool) {
	raw, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/cmdline")
	if err != nil {
		return "", false
	}
	name = strings.TrimSpace(strings.ReplaceAll(string(raw), "\x00", " "))
	return name, name != ""
}

// getGPU resolves the NVML device handle at index ID and reads its name and
// UUID. The handle is returned alongside the GPU so callers can issue further
// queries against the same device.
func getGPU(ID int) (GPU, nvml.Device, error) {
	device, ret := nvml.DeviceGetHandleByIndex(ID)
	if ret != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, nvmlErr("get handle for device "+strconv.Itoa(ID), ret)
	}

	name, ret := device.GetName()
	if ret != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, nvmlErr("get name of device "+strconv.Itoa(ID), ret)
	}

	uuid, ret := device.GetUUID()
	if ret != nvml.SUCCESS {
		return GPU{}, nvml.Device{}, nvmlErr("get UUID of device "+strconv.Itoa(ID), ret)
	}

	return GPU{Name: name, UUID: uuid, Index: ID}, device, nil
}