Files
nvidia-web-dashboard/pkg/nvidia/nvidia.go
2024-02-23 20:24:27 +01:00

269 lines
5.5 KiB
Go

package nvidia
import (
"errors"
"fmt"
"os"
"strconv"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
// Repository is the handle through which the package exposes NVML data.
// It caches the driver and CUDA version strings captured by New.
type Repository struct {
	driverVersion string
	cudaVersion   string
}

// GPU identifies a single device: its NVML name, UUID and the index used
// to look it up.
type GPU struct {
	Name  string
	UUID  string
	Index int
}

// GPUDetail is a GPU together with a snapshot of its sensors and workload.
type GPUDetail struct {
	GPU
	// CoreTemperature is the value read from the nvml.TEMPERATURE_GPU
	// sensor (presumably degrees Celsius; confirm against NVML docs).
	CoreTemperature int
	Utilization     Utilization
	Processes       []Process
	Memory          Memory
	Fans            []Fan
}

// Memory mirrors the fields of NVML's v2 memory info structure
// (presumably byte counts; confirm against NVML docs).
type Memory struct {
	Free     int
	Reserved int
	Total    int
	Used     int
	Version  int
}

// Process describes one process NVML reports as running on a GPU.
type Process struct {
	PID int
	// MemoryUsed is the GPU memory NVML attributes to the process.
	MemoryUsed int
	// Name is meant to carry the process command line; it defaults to "n/a".
	Name string
	// Type is one of "Compute", "Graphics" or "MPSCompute".
	Type string
}

// Utilization groups the decoder, encoder and overall GPU utilization
// figures reported by NVML (presumably percentages; confirm against NVML docs).
type Utilization struct {
	Decode int
	Encode int
	Rate   int
}

// Fan holds the speed NVML reports for a single fan
// (presumably a percentage of maximum; confirm against NVML docs).
type Fan struct {
	Speed int
}
// instance is the package-level Repository created on the first call to New
// and returned by every subsequent call.
var instance *Repository
// Close shuts NVML down and returns the NVML error string as an error if
// the shutdown fails.
//
// On success the cached singleton is dropped so that a later call to New
// initializes NVML again instead of handing back a repository whose
// underlying library has already been shut down.
func (*Repository) Close() error {
	ret := nvml.Shutdown()
	if ret != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(ret))
	}
	instance = nil
	return nil
}
// New returns the package-wide Repository, creating it on first use.
//
// The first call initializes NVML and records the driver and CUDA driver
// versions. It panics if NVML cannot be initialized or if either version
// query fails; each panic message names the step that failed. Later calls
// return the cached instance without touching NVML.
//
// New performs no locking around the cached instance.
func New() *Repository {
	if instance != nil {
		return instance
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		panic("unable to initialize NVML: " + nvml.ErrorString(ret))
	}

	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		panic("unable to get driver version: " + nvml.ErrorString(ret))
	}

	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		panic("unable to get CUDA driver version: " + nvml.ErrorString(ret))
	}

	instance = &Repository{
		driverVersion: driverVersion,
		cudaVersion:   parseCUDA(cudaVersion),
	}
	return instance
}
// GetGPUs lists every device NVML reports, in index order.
//
// It returns an error if the device count cannot be read or if any single
// device fails to resolve; in both cases the returned slice is nil.
func (*Repository) GetGPUs() ([]GPU, error) {
	total, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret))
	}

	list := make([]GPU, total)
	for idx := range list {
		// Only the summary is needed here; the device handle is discarded.
		info, _, err := getGPU(idx)
		if err != nil {
			return nil, err
		}
		list[idx] = info
	}
	return list, nil
}
// GetGPU returns a detailed snapshot of the GPU at NVML index ID: fan
// speeds, core temperature, utilization, memory usage and the compute,
// graphics and MPS processes currently running on it.
//
// Any failing NVML query aborts the call and is returned as an error built
// from the NVML error string. Resolving process names is best effort and
// never fails the call.
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
	gpu, device, err := getGPU(ID)
	if err != nil {
		return GPUDetail{}, err
	}

	fanCount, ret := device.GetNumFans()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}
	fans := make([]Fan, 0, fanCount)
	for i := 0; i < fanCount; i++ {
		speed, ret := device.GetFanSpeed_v2(i)
		if ret != nvml.SUCCESS {
			return GPUDetail{}, errors.New(nvml.ErrorString(ret))
		}
		fans = append(fans, Fan{Speed: int(speed)})
	}

	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	load, ret := device.GetUtilizationRates()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	decUsage, _, ret := device.GetDecoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	encUsage, _, ret := device.GetEncoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	// Fetch all running processes on the GPU.
	var allProcess []Process

	// Compute processes.
	proc, ret := device.GetComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}
	for _, p := range proc {
		allProcess = append(allProcess, Process{
			PID:        int(p.Pid),
			MemoryUsed: int(p.UsedGpuMemory),
			Type:       "Compute",
			Name:       "n/a",
		})
	}

	// Graphics/3D processes.
	proc, ret = device.GetGraphicsRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}
	for _, p := range proc {
		allProcess = append(allProcess, Process{
			PID:        int(p.Pid),
			MemoryUsed: int(p.UsedGpuMemory),
			Type:       "Graphics",
			Name:       "n/a",
		})
	}

	// MPS processes.
	proc, ret = device.GetMPSComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}
	for _, p := range proc {
		allProcess = append(allProcess, Process{
			PID:        int(p.Pid),
			MemoryUsed: int(p.UsedGpuMemory),
			Type:       "MPSCompute",
			Name:       "n/a",
		})
	}

	// Resolve each process' command line from procfs. The slice is indexed
	// rather than ranged by value: a by-value range variable is a copy, so
	// assigning to it would leave every stored Name at "n/a".
	for i := range allProcess {
		cmdline, readErr := os.ReadFile("/proc/" + strconv.Itoa(allProcess[i].PID) + "/cmdline")
		if readErr != nil {
			// Best effort: the process may have exited or be unreadable.
			continue
		}
		// The file holds NUL-separated arguments with a trailing NUL; drop
		// the trailing separators and turn the rest into spaces so the name
		// is printable.
		end := len(cmdline)
		for end > 0 && cmdline[end-1] == 0 {
			end--
		}
		cmdline = cmdline[:end]
		for j := range cmdline {
			if cmdline[j] == 0 {
				cmdline[j] = ' '
			}
		}
		// An empty command line keeps the "n/a" placeholder.
		if len(cmdline) > 0 {
			allProcess[i].Name = string(cmdline)
		}
	}

	mem, ret := device.GetMemoryInfo_v2()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	return GPUDetail{
		GPU:             gpu,
		CoreTemperature: int(temp),
		Utilization: Utilization{
			Decode: int(decUsage),
			Encode: int(encUsage),
			Rate:   int(load.Gpu),
		},
		Processes: allProcess,
		Memory: Memory{
			Free:     int(mem.Free),
			Reserved: int(mem.Reserved),
			Total:    int(mem.Total),
			Used:     int(mem.Used),
			Version:  int(mem.Version),
		},
		Fans: fans,
	}, nil
}
// DriverVersion returns the driver version string stored on the Repository.
func (r *Repository) DriverVersion() string {
return r.driverVersion
}
// CUDAVersion returns the CUDA version string stored on the Repository.
func (r *Repository) CUDAVersion() string {
return r.cudaVersion
}
// parseCUDA renders an integer CUDA version as "major.minor", where the
// thousands carry the major number and the tens carry the minor number
// (for example 12020 becomes "12.2").
func parseCUDA(version int) string {
	major := version / 1000
	// The remainder after removing the major part holds minor*10.
	minor := (version % 1000) / 10
	return fmt.Sprintf("%d.%d", major, minor)
}
// getGPU looks up the device at NVML index ID and returns both its summary
// (name, UUID, index) and the NVML handle for further queries.
//
// If the handle, name or UUID lookup fails, it returns zero values together
// with an error built from the NVML error string.
func getGPU(ID int) (GPU, nvml.Device, error) {
	// fail builds the common error return for any unsuccessful NVML call.
	fail := func(code nvml.Return) (GPU, nvml.Device, error) {
		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(code))
	}

	handle, ret := nvml.DeviceGetHandleByIndex(ID)
	if ret != nvml.SUCCESS {
		return fail(ret)
	}

	label, ret := handle.GetName()
	if ret != nvml.SUCCESS {
		return fail(ret)
	}

	id, ret := handle.GetUUID()
	if ret != nvml.SUCCESS {
		return fail(ret)
	}

	return GPU{Name: label, UUID: id, Index: ID}, handle, nil
}