package nvidia import ( "errors" "fmt" "os" "strconv" "strings" "github.com/NVIDIA/go-nvml/pkg/nvml" ) type ( Repository struct { driverVersion string cudaVersion string } GPU struct { Name string `json:"name"` UUID string `json:"uuid"` Index int `json:"-"` } GPUDetail struct { GPU CoreTemperature int `json:"coreTemperature"` Utilization Utilization `json:"usage"` Processes []Process `json:"processes"` Memory Memory `json:"memory"` Fans []Fan `json:"fans"` } Memory struct { Free int `json:"free"` Reserved int `json:"reserved"` Total int `json:"total"` Used int `json:"used"` Version int `json:"-"` } Process struct { PID int `json:"pid"` MemoryUsed int `json:"memoryUsed"` Name string `json:"commandLine"` Type string `json:"type"` } Utilization struct { Decoder int `json:"decoder"` Encoder int `json:"encoder"` Compute int `json:"compute"` } Fan struct { Speed int `json:"speed"` } ) var instance *Repository func (*Repository) Close() error { ret := nvml.Shutdown() if ret != nvml.SUCCESS { return errors.New(nvml.ErrorString(ret)) } return nil } func New() *Repository { if instance == nil { ret := nvml.Init() if ret != nvml.SUCCESS { panic("unable to initialize NVML: " + nvml.ErrorString(ret)) } driverVersion, ret := nvml.SystemGetDriverVersion() if ret != nvml.SUCCESS { panic("unable to initialize NVML: " + nvml.ErrorString(ret)) } cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2() if ret != nvml.SUCCESS { panic("unable to initialize NVML: " + nvml.ErrorString(ret)) } instance = &Repository{ driverVersion: driverVersion, cudaVersion: parseCUDA(cudaVersion), } } return instance } func (*Repository) GetGPUs() ([]GPU, error) { count, ret := nvml.DeviceGetCount() if ret != nvml.SUCCESS { return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret)) } gpus := make([]GPU, 0, count) for i := 0; i < count; i++ { gpu, _, err := getGPU(i) if err != nil { return nil, err } gpus = append(gpus, gpu) } return gpus, nil } func (*Repository) GetGPU(ID int) (GPUDetail, error) { gpu, device, err := getGPU(ID) if err != nil { return GPUDetail{}, err } fanCount, ret := device.GetNumFans() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } fans := make([]Fan, 0, fanCount) for i := 0; i < fanCount; i++ { fdev, ret := device.GetFanSpeed_v2(i) if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } fan := Fan{ Speed: int(fdev), } fans = append(fans, fan) } temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU) if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } load, ret := device.GetUtilizationRates() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } decUsage, _, ret := device.GetDecoderUtilization() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } encUsage, _, ret := device.GetEncoderUtilization() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } // Fetch all running process on the GPU var allProcess []Process // Compute proc proc, ret := device.GetComputeRunningProcesses() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } for _, p := range proc { sproc := Process{ PID: int(p.Pid), MemoryUsed: int(p.UsedGpuMemory), Type: "Compute", Name: "n/a", } allProcess = append(allProcess, sproc) } // Graphics/3D procs proc, ret = device.GetGraphicsRunningProcesses() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } for _, p := range proc { sproc := Process{ PID: int(p.Pid), MemoryUsed: int(p.UsedGpuMemory), Type: "Graphics", Name: "n/a", } allProcess = append(allProcess, sproc) } // MPS procs proc, ret = device.GetMPSComputeRunningProcesses() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } for _, p := range proc { sproc := Process{ PID: int(p.Pid), MemoryUsed: int(p.UsedGpuMemory), Type: "MPSCompute", Name: "n/a", } allProcess = append(allProcess, sproc) } // Find process command line for i, process := range allProcess { if cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(process.PID) + "/cmdline"); err == nil { process.Name = strings.ReplaceAll(string(cmdline), "\u0000", " ") allProcess[i] = process } } mem, ret := device.GetMemoryInfo_v2() if ret != nvml.SUCCESS { return GPUDetail{}, errors.New(nvml.ErrorString(ret)) } return GPUDetail{ GPU: gpu, CoreTemperature: int(temp), Utilization: Utilization{ Decoder: int(decUsage), Encoder: int(encUsage), Compute: int(load.Gpu), }, Processes: allProcess, Memory: Memory{ Free: int(mem.Free), Reserved: int(mem.Reserved), Total: int(mem.Total), Used: int(mem.Used), Version: int(mem.Version), }, Fans: fans, }, nil } func (r *Repository) DriverVersion() string { return r.driverVersion } func (r *Repository) CUDAVersion() string { return r.cudaVersion } func parseCUDA(version int) string { major := (int)(version / 1000) minor := (int)((version - (major * 1000)) / 10) return fmt.Sprintf("%d.%d", major, minor) } func getGPU(ID int) (GPU, nvml.Device, error) { device, ret := nvml.DeviceGetHandleByIndex(ID) if ret != nvml.SUCCESS { return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) } name, ret := device.GetName() if ret != nvml.SUCCESS { return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) } uuid, ret := device.GetUUID() if ret != nvml.SUCCESS { return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) } gpu := GPU{ Name: name, UUID: uuid, Index: ID, } return gpu, device, nil }