// Package nvidia polls NVIDIA GPUs through NVML and caches their state
// for concurrent readers. RunDaemon starts the poller; the exported
// accessors read the cache.
package nvidia

import (
	"errors"
	"fmt"
	"log"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

type (
	// cache holds the most recent snapshot of every detected GPU,
	// keyed by UUID. It is shared between the polling goroutine and
	// the public read accessors, hence the RWMutex.
	cache struct {
		gpus map[string]GPU
		mu   sync.RWMutex
	}

	// GPUSummary is a lightweight identification of one GPU.
	GPUSummary struct {
		UUID string
		Name string
	}

	// GPU is a full snapshot of a single device.
	GPU struct {
		Name        string      `json:"name"`
		UUID        string      `json:"uuid"`
		Index       int         `json:"-"`
		Temperature Temperature `json:"temperature"`
		Utilization Utilization `json:"usage"`
		Processes   []Process   `json:"processes"`
		Memory      Memory      `json:"memory"`
		Fans        []Fan       `json:"fans"`
	}

	// Temperature tracks the current reading plus the minimum and
	// maximum observed since the daemon started.
	Temperature struct {
		Min     uint
		Max     uint
		Current uint
	}

	// Memory mirrors the fields of nvml's memory info struct.
	Memory struct {
		Free     int `json:"free"`
		Reserved int `json:"reserved"`
		Total    int `json:"total"`
		Used     int `json:"used"`
		Version  int `json:"-"`
	}

	// Process describes one process running on a GPU.
	Process struct {
		PID        int    `json:"pid"`
		MemoryUsed int    `json:"memoryUsed"`
		Name       string `json:"commandLine"`
		Type       string `json:"type"`
	}

	// Utilization holds per-engine usage percentages.
	Utilization struct {
		Decoder int `json:"decoder"`
		Encoder int `json:"encoder"`
		Compute int `json:"compute"`
	}

	// Fan reports one fan's speed.
	Fan struct {
		Speed int `json:"speed"`
	}
)

var (
	ErrInitialization = errors.New("unable to initialize NVML")
	ErrDriverAPI      = errors.New("an error occurred while querying the driver")
	ErrNotFound       = errors.New("the gpu is not found")

	// c is the package-level cache; it is non-nil only after RunDaemon.
	c *cache
)

// Close shuts NVML down and releases driver resources.
func Close() error {
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	return nil
}

// RunDaemon initializes NVML and starts a background goroutine that
// refreshes the GPU cache once per second. It must be called once,
// before any of the read accessors; calling it again would start a
// second polling goroutine.
func RunDaemon() {
	c = &cache{
		gpus: make(map[string]GPU),
	}
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Println("[ERROR]", nvml.ErrorString(ret))
		return
	}
	go func() {
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for range ticker.C {
			update()
		}
	}()
}

// update queries every device and merges the results into the cache,
// carrying each GPU's historical temperature min/max forward.
func update() {
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Println("[ERROR]", nvml.ErrorString(ret))
		return
	}

	// Run the (potentially slow) driver queries before taking the
	// write lock so readers are not blocked in the meantime.
	fresh := make([]GPU, 0, count)
	for i := 0; i < count; i++ {
		gpu, err := query(i)
		if err != nil {
			// One failing device must not prevent the others from
			// being refreshed.
			log.Println("[ERROR]", err)
			continue
		}
		fresh = append(fresh, gpu)
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	for _, gpu := range fresh {
		t := &gpu.Temperature
		if prev, ok := c.gpus[gpu.UUID]; ok {
			if prev.Temperature.Min < t.Current {
				t.Min = prev.Temperature.Min
			} else {
				t.Min = t.Current
			}
			if prev.Temperature.Max > t.Current {
				t.Max = prev.Temperature.Max
			} else {
				t.Max = t.Current
			}
		} else {
			t.Min = t.Current
			t.Max = t.Current
		}
		c.gpus[gpu.UUID] = gpu
	}
}

// GPUByUUID returns the cached snapshot for the given UUID. It wraps
// ErrNotFound when the device is unknown or the daemon has not been
// started (previously the latter was a nil-pointer panic).
func GPUByUUID(uuid string) (GPU, error) {
	if c == nil {
		return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	gpu, ok := c.gpus[uuid]
	if !ok {
		return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
	}
	return gpu, nil
}

// Summary lists the UUID and name of every cached GPU. Order is not
// deterministic (map iteration). It returns nil before RunDaemon has
// been called or when no GPU has been cached yet.
func Summary() []GPUSummary {
	if c == nil {
		return nil
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	var res []GPUSummary
	for _, gpu := range c.gpus {
		res = append(res, GPUSummary{
			UUID: gpu.UUID,
			Name: gpu.Name,
		})
	}
	return res
}

// DriverVersion reports the installed NVIDIA driver version.
func DriverVersion() (string, error) {
	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
	}
	return driverVersion, nil
}

// CUDAVersion reports the CUDA version supported by the driver in
// "major.minor" form.
func CUDAVersion() (string, error) {
	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
	}
	return parseCUDA(cudaVersion), nil
}

// parseCUDA converts NVML's packed CUDA version encoding
// (major*1000 + minor*10) into a "major.minor" string.
func parseCUDA(version int) string {
	major := version / 1000
	minor := (version % 1000) / 10
	return fmt.Sprintf("%d.%d", major, minor)
}

// appendProcesses converts NVML process records of one kind and
// appends them to dst. Command lines are filled in later from /proc.
func appendProcesses(dst []Process, procs []nvml.ProcessInfo, kind string) []Process {
	for _, p := range procs {
		dst = append(dst, Process{
			PID:        int(p.Pid),
			MemoryUsed: int(p.UsedGpuMemory),
			Type:       kind,
			Name:       "n/a",
		})
	}
	return dst
}

// resolveCommandLines best-effort replaces each process's "n/a"
// placeholder with its command line read from /proc/<pid>/cmdline.
// Processes that exited in the meantime keep the placeholder.
func resolveCommandLines(procs []Process) {
	for i := range procs {
		cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(procs[i].PID) + "/cmdline")
		if err != nil {
			continue
		}
		// Arguments in cmdline are NUL-separated; join with spaces.
		procs[i].Name = strings.ReplaceAll(string(cmdline), "\u0000", " ")
	}
}

// query builds a full snapshot of the device at the given NVML index.
// Every failed driver call is wrapped in ErrDriverAPI with the name of
// the call that failed.
func query(index int) (GPU, error) {
	device, ret := nvml.DeviceGetHandleByIndex(index)
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	name, ret := device.GetName()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	uuid, ret := device.GetUUID()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}

	fanCount, ret := device.GetNumFans()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	fans := make([]Fan, 0, fanCount)
	for i := 0; i < fanCount; i++ {
		speed, ret := device.GetFanSpeed_v2(i)
		if ret != nvml.SUCCESS {
			return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
		}
		fans = append(fans, Fan{Speed: int(speed)})
	}

	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	load, ret := device.GetUtilizationRates()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	decUsage, _, ret := device.GetDecoderUtilization()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	encUsage, _, ret := device.GetEncoderUtilization()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}

	// Gather every kind of process running on the GPU.
	allProcess := make([]Process, 0)
	compute, ret := device.GetComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	allProcess = appendProcesses(allProcess, compute, "Compute")

	graphics, ret := device.GetGraphicsRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	allProcess = appendProcesses(allProcess, graphics, "Graphics")

	mps, ret := device.GetMPSComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	allProcess = appendProcesses(allProcess, mps, "MPSCompute")

	resolveCommandLines(allProcess)

	mem, ret := device.GetMemoryInfo_v2()
	if ret != nvml.SUCCESS {
		return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}

	return GPU{
		Name: name,
		UUID: uuid,
		// Index was declared on GPU but never populated before.
		Index: index,
		Temperature: Temperature{
			Current: uint(temp),
		},
		Utilization: Utilization{
			Decoder: int(decUsage),
			Encoder: int(encUsage),
			Compute: int(load.Gpu),
		},
		Processes: allProcess,
		Memory: Memory{
			Free:     int(mem.Free),
			Reserved: int(mem.Reserved),
			Total:    int(mem.Total),
			Used:     int(mem.Used),
			Version:  int(mem.Version),
		},
		Fans: fans,
	}, nil
}