// Package nvidia polls NVIDIA GPUs through NVML and caches their state
// for concurrent readers.
package nvidia
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
|
)
|
|
|
|
type (
	// cache is the shared snapshot of all GPUs, keyed by GPU UUID.
	// mu guards gpus: the background update daemon takes the write lock,
	// readers (GPUByUUID, Summary) take the read lock.
	cache struct {
		gpus map[string]GPU
		mu   sync.RWMutex
	}

	// GPUSummary identifies a GPU by its UUID and product name.
	GPUSummary struct {
		UUID string
		Name string
	}

	// GPU is a full snapshot of one device as produced by query and
	// cached by the update daemon.
	GPU struct {
		Name        string      `json:"name"`
		UUID        string      `json:"uuid"`
		Index       int         `json:"-"`
		Temperature Temperature `json:"temperature"`
		Utilization Utilization `json:"usage"`
		Processes   []Process   `json:"processes"`
		Memory      Memory      `json:"memory"`
		Fans        []Fan       `json:"fans"`
	}

	// Temperature holds the current reading plus the minimum and maximum
	// values observed since the daemon started (units as reported by
	// NVML — presumably degrees Celsius; confirm against go-nvml docs).
	Temperature struct {
		Min     uint
		Max     uint
		Current uint
	}

	// Memory mirrors NVML's memory info counters (values presumably in
	// bytes — confirm against go-nvml GetMemoryInfo_v2 docs).
	Memory struct {
		Free     int `json:"free"`
		Reserved int `json:"reserved"`
		Total    int `json:"total"`
		Used     int `json:"used"`
		Version  int `json:"-"`
	}

	// Process describes one process currently running on the GPU.
	Process struct {
		PID        int    `json:"pid"`
		MemoryUsed int    `json:"memoryUsed"`
		Name       string `json:"commandLine"` // read from /proc/<pid>/cmdline; "n/a" when unreadable
		Type       string `json:"type"`        // "Compute", "Graphics", or "MPSCompute"
	}

	// Utilization holds per-engine usage figures reported by NVML.
	Utilization struct {
		Decoder int `json:"decoder"`
		Encoder int `json:"encoder"`
		Compute int `json:"compute"`
	}

	// Fan reports a single fan's speed (per NVML GetFanSpeed_v2,
	// presumably percent of intended maximum — confirm).
	Fan struct {
		Speed int `json:"speed"`
	}
)
|
|
|
|
var (
|
|
ErrInitialization = errors.New("unable to initialize NVML")
|
|
ErrDriverAPI = errors.New("an error occured while querying the driver")
|
|
ErrNotFound = errors.New("the gpu is not found")
|
|
|
|
c *cache
|
|
)
|
|
|
|
func Close() error {
|
|
ret := nvml.Shutdown()
|
|
if ret != nvml.SUCCESS {
|
|
return errors.New(nvml.ErrorString(ret))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func RunDaemon() {
|
|
c = &cache{
|
|
gpus: make(map[string]GPU),
|
|
}
|
|
ret := nvml.Init()
|
|
if ret != nvml.SUCCESS {
|
|
log.Println("[ERROR]", nvml.ErrorString(ret))
|
|
return
|
|
}
|
|
go func() {
|
|
for {
|
|
time.Sleep(1 * time.Second)
|
|
update()
|
|
}
|
|
}()
|
|
}
|
|
|
|
func update() {
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
count, ret := nvml.DeviceGetCount()
|
|
if ret != nvml.SUCCESS {
|
|
log.Println("[ERROR]", nvml.ErrorString(ret))
|
|
return
|
|
}
|
|
for i := 0; i < count; i++ {
|
|
gpu, err := query(i)
|
|
if err != nil {
|
|
log.Println("[ERROR]", err)
|
|
return
|
|
}
|
|
t := &gpu.Temperature
|
|
if g, ok := c.gpus[gpu.UUID]; ok {
|
|
if g.Temperature.Min > t.Current {
|
|
t.Min = t.Current
|
|
} else {
|
|
t.Min = g.Temperature.Min
|
|
}
|
|
if g.Temperature.Max < t.Current {
|
|
t.Max = t.Current
|
|
} else {
|
|
t.Max = g.Temperature.Max
|
|
}
|
|
} else {
|
|
t.Max = t.Current
|
|
t.Min = t.Current
|
|
}
|
|
c.gpus[gpu.UUID] = gpu
|
|
}
|
|
}
|
|
|
|
func GPUByUUID(uuid string) (GPU, error) {
|
|
c.mu.RLock()
|
|
defer c.mu.RUnlock()
|
|
gpu, ok := c.gpus[uuid]
|
|
if !ok {
|
|
return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
|
|
}
|
|
return gpu, nil
|
|
}
|
|
|
|
func Summary() []GPUSummary {
|
|
c.mu.RLock()
|
|
defer c.mu.RUnlock()
|
|
var res []GPUSummary
|
|
for _, gpu := range c.gpus {
|
|
res = append(res, GPUSummary{
|
|
UUID: gpu.UUID,
|
|
Name: gpu.Name,
|
|
})
|
|
}
|
|
return res
|
|
}
|
|
|
|
func DriverVersion() (string, error) {
|
|
driverVersion, ret := nvml.SystemGetDriverVersion()
|
|
if ret != nvml.SUCCESS {
|
|
return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
|
|
}
|
|
return driverVersion, nil
|
|
}
|
|
|
|
func CUDAVersion() (string, error) {
|
|
cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
|
|
if ret != nvml.SUCCESS {
|
|
return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
|
|
}
|
|
return parseCUDA(cudaVersion), nil
|
|
}
|
|
|
|
// parseCUDA converts an NVML-encoded CUDA version (major*1000 + minor*10)
// into a human-readable "major.minor" string, e.g. 12040 -> "12.4".
// (Removed the redundant (int) casts on what is already an int.)
func parseCUDA(version int) string {
	major := version / 1000
	minor := (version % 1000) / 10
	return fmt.Sprintf("%d.%d", major, minor)
}
|
|
|
|
func query(index int) (GPU, error) {
|
|
device, ret := nvml.DeviceGetHandleByIndex(index)
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
name, ret := device.GetName()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
uuid, ret := device.GetUUID()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
fanCount, ret := device.GetNumFans()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
fans := make([]Fan, 0, fanCount)
|
|
for i := 0; i < fanCount; i++ {
|
|
fdev, ret := device.GetFanSpeed_v2(i)
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
fan := Fan{
|
|
Speed: int(fdev),
|
|
}
|
|
fans = append(fans, fan)
|
|
}
|
|
|
|
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
load, ret := device.GetUtilizationRates()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
decUsage, _, ret := device.GetDecoderUtilization()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
encUsage, _, ret := device.GetEncoderUtilization()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
// Fetch all running process on the GPU
|
|
allProcess := make([]Process, 0)
|
|
|
|
// Compute proc
|
|
proc, ret := device.GetComputeRunningProcesses()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
for _, p := range proc {
|
|
|
|
sproc := Process{
|
|
PID: int(p.Pid),
|
|
MemoryUsed: int(p.UsedGpuMemory),
|
|
Type: "Compute",
|
|
Name: "n/a",
|
|
}
|
|
allProcess = append(allProcess, sproc)
|
|
}
|
|
|
|
// Graphics/3D procs
|
|
proc, ret = device.GetGraphicsRunningProcesses()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
for _, p := range proc {
|
|
sproc := Process{
|
|
PID: int(p.Pid),
|
|
MemoryUsed: int(p.UsedGpuMemory),
|
|
Type: "Graphics",
|
|
Name: "n/a",
|
|
}
|
|
allProcess = append(allProcess, sproc)
|
|
}
|
|
|
|
// MPS procs
|
|
proc, ret = device.GetMPSComputeRunningProcesses()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
for _, p := range proc {
|
|
sproc := Process{
|
|
PID: int(p.Pid),
|
|
MemoryUsed: int(p.UsedGpuMemory),
|
|
Type: "MPSCompute",
|
|
Name: "n/a",
|
|
}
|
|
allProcess = append(allProcess, sproc)
|
|
}
|
|
|
|
// Find process command line
|
|
for i, process := range allProcess {
|
|
if cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(process.PID) + "/cmdline"); err == nil {
|
|
process.Name = strings.ReplaceAll(string(cmdline), "\u0000", " ")
|
|
allProcess[i] = process
|
|
}
|
|
}
|
|
|
|
mem, ret := device.GetMemoryInfo_v2()
|
|
if ret != nvml.SUCCESS {
|
|
return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
|
}
|
|
|
|
return GPU{
|
|
Name: name,
|
|
UUID: uuid,
|
|
Temperature: Temperature{
|
|
Current: uint(temp),
|
|
},
|
|
Utilization: Utilization{
|
|
Decoder: int(decUsage),
|
|
Encoder: int(encUsage),
|
|
Compute: int(load.Gpu),
|
|
},
|
|
Processes: allProcess,
|
|
Memory: Memory{
|
|
Free: int(mem.Free),
|
|
Reserved: int(mem.Reserved),
|
|
Total: int(mem.Total),
|
|
Used: int(mem.Used),
|
|
Version: int(mem.Version),
|
|
},
|
|
Fans: fans,
|
|
}, nil
|
|
}
|