Files

319 lines
7.1 KiB
Go

package nvidia
import (
"errors"
"fmt"
"log"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
type (
	// cache is the process-wide store of the most recent GPU snapshots,
	// keyed by GPU UUID and guarded by mu for concurrent access.
	cache struct {
		gpus map[string]GPU
		mu   sync.RWMutex
	}

	// GPUSummary is a minimal identification record used when listing GPUs.
	GPUSummary struct {
		UUID string
		Name string
	}

	// GPU is a full point-in-time snapshot of one device, as cached by
	// the polling daemon.
	GPU struct {
		Name        string      `json:"name"`
		UUID        string      `json:"uuid"`
		Index       int         `json:"-"`
		Temperature Temperature `json:"temperature"`
		Utilization Utilization `json:"usage"`
		Processes   []Process   `json:"processes"`
		Memory      Memory      `json:"memory"`
		Fans        []Fan       `json:"fans"`
	}

	// Temperature holds the current core reading plus the running
	// minimum/maximum observed across polls (maintained by update).
	// NOTE(review): values come from nvml.TEMPERATURE_GPU and are
	// presumably degrees Celsius — confirm against NVML docs.
	Temperature struct {
		Min     uint
		Max     uint
		Current uint
	}

	// Memory reports device memory usage as returned by NVML's memory
	// info query. NOTE(review): presumably byte counts — confirm.
	Memory struct {
		Free     int `json:"free"`
		Reserved int `json:"reserved"`
		Total    int `json:"total"`
		Used     int `json:"used"`
		Version  int `json:"-"`
	}

	// Process describes one process currently using the GPU. Name is
	// the command line read from /proc, or "n/a" when unavailable.
	Process struct {
		PID        int    `json:"pid"`
		MemoryUsed int    `json:"memoryUsed"`
		Name       string `json:"commandLine"`
		Type       string `json:"type"`
	}

	// Utilization holds decoder, encoder and compute engine usage.
	// NOTE(review): presumably percentages (NVML utilization rates) —
	// confirm.
	Utilization struct {
		Decoder int `json:"decoder"`
		Encoder int `json:"encoder"`
		Compute int `json:"compute"`
	}

	// Fan reports a single fan's speed as returned by GetFanSpeed_v2.
	Fan struct {
		Speed int `json:"speed"`
	}
)
var (
	// ErrInitialization indicates NVML could not be initialized.
	ErrInitialization = errors.New("unable to initialize NVML")
	// ErrDriverAPI wraps failures of individual NVML driver calls.
	// (Fixed typo: "occured" -> "occurred".)
	ErrDriverAPI = errors.New("an error occurred while querying the driver")
	// ErrNotFound is returned when a requested GPU UUID is not cached.
	ErrNotFound = errors.New("the gpu is not found")

	// c is the package-level snapshot cache, created by RunDaemon.
	c *cache
)
// Close shuts down the NVML library, releasing driver resources.
// It returns an error describing the NVML failure, or nil on success.
func Close() error {
	if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	return nil
}
// RunDaemon initializes the package cache and NVML, then starts a
// background goroutine that refreshes the GPU snapshots once per
// second. If NVML fails to initialize, the error is logged and no
// polling goroutine is started (the cache stays empty but usable).
//
// NOTE(review): the polling goroutine has no stop mechanism and runs
// for the lifetime of the process; Close only shuts down NVML.
func RunDaemon() {
	c = &cache{
		gpus: make(map[string]GPU),
	}
	ret := nvml.Init()
	if ret != nvml.SUCCESS {
		log.Println("[ERROR]", nvml.ErrorString(ret))
		return
	}
	go func() {
		// Ticker instead of a sleep loop; the first refresh fires one
		// second after startup, matching the original cadence.
		ticker := time.NewTicker(1 * time.Second)
		defer ticker.Stop()
		for range ticker.C {
			update()
		}
	}()
}
// update refreshes the cached snapshot of every GPU the driver
// reports, carrying each GPU's running min/max temperature forward
// across polls. It holds the cache write lock for the duration.
func update() {
	c.mu.Lock()
	defer c.mu.Unlock()
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Println("[ERROR]", nvml.ErrorString(ret))
		return
	}
	for i := 0; i < count; i++ {
		gpu, err := query(i)
		if err != nil {
			// One failing device must not prevent the remaining
			// devices from being refreshed (previously this returned,
			// aborting the whole pass).
			log.Println("[ERROR]", err)
			continue
		}
		t := &gpu.Temperature
		if prev, ok := c.gpus[gpu.UUID]; ok {
			// Extend the running minimum/maximum with the new reading.
			if prev.Temperature.Min > t.Current {
				t.Min = t.Current
			} else {
				t.Min = prev.Temperature.Min
			}
			if prev.Temperature.Max < t.Current {
				t.Max = t.Current
			} else {
				t.Max = prev.Temperature.Max
			}
		} else {
			// First observation of this GPU: seed both bounds.
			t.Min = t.Current
			t.Max = t.Current
		}
		c.gpus[gpu.UUID] = gpu
	}
}
// GPUByUUID returns the cached snapshot of the GPU identified by uuid.
// It returns an error wrapping ErrNotFound when no such GPU is cached.
func GPUByUUID(uuid string) (GPU, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if gpu, ok := c.gpus[uuid]; ok {
		return gpu, nil
	}
	return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
}
// Summary returns the UUID and name of every cached GPU. Map iteration
// order is random, so the order of the result is unspecified. A nil
// slice is returned when no GPUs are cached (preserving JSON "null"
// semantics for existing consumers).
func Summary() []GPUSummary {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if len(c.gpus) == 0 {
		return nil
	}
	// Pre-size: exactly one summary per cached GPU.
	res := make([]GPUSummary, 0, len(c.gpus))
	for _, gpu := range c.gpus {
		res = append(res, GPUSummary{
			UUID: gpu.UUID,
			Name: gpu.Name,
		})
	}
	return res
}
// DriverVersion reports the installed NVIDIA driver version string.
func DriverVersion() (string, error) {
	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		// A failed version query is a driver-API error, not an
		// initialization failure; wrap the sentinel every other NVML
		// query wrapper in this package uses.
		return "", fmt.Errorf("%w: SystemGetDriverVersion: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	return driverVersion, nil
}
// CUDAVersion reports the CUDA driver version as a "major.minor"
// string (e.g. "12.4").
func CUDAVersion() (string, error) {
	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		// A failed version query is a driver-API error, not an
		// initialization failure; use the package's query sentinel.
		return "", fmt.Errorf("%w: SystemGetCudaDriverVersion_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
	}
	return parseCUDA(cudaVersion), nil
}
// parseCUDA converts an NVML-encoded CUDA version integer into a
// "major.minor" string, e.g. 12040 -> "12.4" (encoding is
// major*1000 + minor*10).
func parseCUDA(version int) string {
	major := version / 1000
	minor := version % 1000 / 10
	return fmt.Sprintf("%d.%d", major, minor)
}
func query(index int) (GPU, error) {
device, ret := nvml.DeviceGetHandleByIndex(index)
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
name, ret := device.GetName()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
uuid, ret := device.GetUUID()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fanCount, ret := device.GetNumFans()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fans := make([]Fan, 0, fanCount)
for i := 0; i < fanCount; i++ {
fdev, ret := device.GetFanSpeed_v2(i)
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
fan := Fan{
Speed: int(fdev),
}
fans = append(fans, fan)
}
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
load, ret := device.GetUtilizationRates()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
decUsage, _, ret := device.GetDecoderUtilization()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
encUsage, _, ret := device.GetEncoderUtilization()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
// Fetch all running process on the GPU
allProcess := make([]Process, 0)
// Compute proc
proc, ret := device.GetComputeRunningProcesses()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
PID: int(p.Pid),
MemoryUsed: int(p.UsedGpuMemory),
Type: "Compute",
Name: "n/a",
}
allProcess = append(allProcess, sproc)
}
// Graphics/3D procs
proc, ret = device.GetGraphicsRunningProcesses()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
PID: int(p.Pid),
MemoryUsed: int(p.UsedGpuMemory),
Type: "Graphics",
Name: "n/a",
}
allProcess = append(allProcess, sproc)
}
// MPS procs
proc, ret = device.GetMPSComputeRunningProcesses()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
for _, p := range proc {
sproc := Process{
PID: int(p.Pid),
MemoryUsed: int(p.UsedGpuMemory),
Type: "MPSCompute",
Name: "n/a",
}
allProcess = append(allProcess, sproc)
}
// Find process command line
for i, process := range allProcess {
if cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(process.PID) + "/cmdline"); err == nil {
process.Name = strings.ReplaceAll(string(cmdline), "\u0000", " ")
allProcess[i] = process
}
}
mem, ret := device.GetMemoryInfo_v2()
if ret != nvml.SUCCESS {
return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
}
return GPU{
Name: name,
UUID: uuid,
Temperature: Temperature{
Current: uint(temp),
},
Utilization: Utilization{
Decoder: int(decUsage),
Encoder: int(encUsage),
Compute: int(load.Gpu),
},
Processes: allProcess,
Memory: Memory{
Free: int(mem.Free),
Reserved: int(mem.Reserved),
Total: int(mem.Total),
Used: int(mem.Used),
Version: int(mem.Version),
},
Fans: fans,
}, nil
}