Start project
This commit is contained in:
268
pkg/nvidia/nvidia.go
Normal file
268
pkg/nvidia/nvidia.go
Normal file
@@ -0,0 +1,268 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
// Repository is the entry point for NVML queries. It also carries the driver
// and CUDA version strings captured when it was created.
type Repository struct {
	driverVersion string
	cudaVersion   string
}

// GPU identifies one device by its name, its UUID and the index it was looked
// up with.
type GPU struct {
	Name  string
	UUID  string
	Index int
}

// GPUDetail extends GPU with the live readings collected by GetGPU.
type GPUDetail struct {
	GPU
	// CoreTemperature is the reading of the nvml.TEMPERATURE_GPU sensor.
	CoreTemperature int
	Utilization     Utilization
	// Processes lists the compute, graphics and MPS compute processes
	// reported for the device.
	Processes []Process
	Memory    Memory
	// Fans holds one entry per fan reported by the device.
	Fans []Fan
}

// Memory mirrors the fields of NVML's v2 memory info structure.
// NOTE(review): values are presumably expressed in bytes; confirm against the
// NVML documentation.
type Memory struct {
	Free     int
	Reserved int
	Total    int
	Used     int
	Version  int
}

// Process describes one process running on a device.
type Process struct {
	PID        int
	MemoryUsed int
	// Name is the process command line, or "n/a" when it is not known.
	Name string
	// Type is one of "Compute", "Graphics" or "MPSCompute".
	Type string
}

// Utilization groups the decoder, encoder and overall GPU utilization
// readings.
// NOTE(review): presumably percentages; confirm against the NVML
// documentation.
type Utilization struct {
	Decode int
	Encode int
	Rate   int
}

// Fan holds the speed reading of a single fan.
type Fan struct {
	Speed int
}
|
||||
|
||||
// instance is the package-level Repository that New creates on demand and
// hands back to every caller.
var instance *Repository
|
||||
|
||||
func (*Repository) Close() error {
|
||||
ret := nvml.Shutdown()
|
||||
if ret != nvml.SUCCESS {
|
||||
return errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func New() *Repository {
|
||||
if instance == nil {
|
||||
ret := nvml.Init()
|
||||
if ret != nvml.SUCCESS {
|
||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
||||
}
|
||||
driverVersion, ret := nvml.SystemGetDriverVersion()
|
||||
if ret != nvml.SUCCESS {
|
||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
||||
}
|
||||
cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
|
||||
if ret != nvml.SUCCESS {
|
||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
||||
}
|
||||
instance = &Repository{
|
||||
driverVersion: driverVersion,
|
||||
cudaVersion: parseCUDA(cudaVersion),
|
||||
}
|
||||
}
|
||||
return instance
|
||||
}
|
||||
|
||||
func (*Repository) GetGPUs() ([]GPU, error) {
|
||||
count, ret := nvml.DeviceGetCount()
|
||||
if ret != nvml.SUCCESS {
|
||||
return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret))
|
||||
}
|
||||
gpus := make([]GPU, 0, count)
|
||||
for i := 0; i < count; i++ {
|
||||
gpu, _, err := getGPU(i)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
gpus = append(gpus, gpu)
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
||||
gpu, device, err := getGPU(ID)
|
||||
if err != nil {
|
||||
return GPUDetail{}, err
|
||||
}
|
||||
|
||||
fanCount, ret := device.GetNumFans()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
fans := make([]Fan, 0, fanCount)
|
||||
for i := 0; i < fanCount; i++ {
|
||||
fdev, ret := device.GetFanSpeed_v2(i)
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
fan := Fan{
|
||||
Speed: int(fdev),
|
||||
}
|
||||
fans = append(fans, fan)
|
||||
}
|
||||
|
||||
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
load, ret := device.GetUtilizationRates()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
decUsage, _, ret := device.GetDecoderUtilization()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
encUsage, _, ret := device.GetEncoderUtilization()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
// Fetch all running process on the GPU
|
||||
var allProcess []Process
|
||||
|
||||
// Compute proc
|
||||
proc, ret := device.GetComputeRunningProcesses()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
for _, p := range proc {
|
||||
|
||||
sproc := Process{
|
||||
PID: int(p.Pid),
|
||||
MemoryUsed: int(p.UsedGpuMemory),
|
||||
Type: "Compute",
|
||||
Name: "n/a",
|
||||
}
|
||||
allProcess = append(allProcess, sproc)
|
||||
}
|
||||
|
||||
// Graphics/3D procs
|
||||
proc, ret = device.GetGraphicsRunningProcesses()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
for _, p := range proc {
|
||||
sproc := Process{
|
||||
PID: int(p.Pid),
|
||||
MemoryUsed: int(p.UsedGpuMemory),
|
||||
Type: "Graphics",
|
||||
Name: "n/a",
|
||||
}
|
||||
allProcess = append(allProcess, sproc)
|
||||
}
|
||||
|
||||
// MPS procs
|
||||
proc, ret = device.GetMPSComputeRunningProcesses()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
for _, p := range proc {
|
||||
sproc := Process{
|
||||
PID: int(p.Pid),
|
||||
MemoryUsed: int(p.UsedGpuMemory),
|
||||
Type: "MPSCompute",
|
||||
Name: "n/a",
|
||||
}
|
||||
allProcess = append(allProcess, sproc)
|
||||
}
|
||||
|
||||
// Find process command line
|
||||
for _, process := range allProcess {
|
||||
if cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(process.PID) + "/cmdline"); err == nil {
|
||||
process.Name = string(cmdline)
|
||||
}
|
||||
}
|
||||
|
||||
mem, ret := device.GetMemoryInfo_v2()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
return GPUDetail{
|
||||
GPU: gpu,
|
||||
CoreTemperature: int(temp),
|
||||
Utilization: Utilization{
|
||||
Decode: int(decUsage),
|
||||
Encode: int(encUsage),
|
||||
Rate: int(load.Gpu),
|
||||
},
|
||||
Processes: allProcess,
|
||||
Memory: Memory{
|
||||
Free: int(mem.Free),
|
||||
Reserved: int(mem.Reserved),
|
||||
Total: int(mem.Total),
|
||||
Used: int(mem.Used),
|
||||
Version: int(mem.Version),
|
||||
},
|
||||
Fans: fans,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (r *Repository) DriverVersion() string {
|
||||
return r.driverVersion
|
||||
}
|
||||
|
||||
func (r *Repository) CUDAVersion() string {
|
||||
return r.cudaVersion
|
||||
}
|
||||
|
||||
// parseCUDA renders an integer CUDA version as a "major.minor" string.
//
// The major part is the number of whole thousands; the minor part is the
// remainder divided by ten. For example 12020 becomes "12.2".
func parseCUDA(version int) string {
	const (
		majorUnit = 1000
		minorUnit = 10
	)

	major := version / majorUnit
	minor := version % majorUnit / minorUnit

	return fmt.Sprintf("%d.%d", major, minor)
}
|
||||
|
||||
func getGPU(ID int) (GPU, nvml.Device, error) {
|
||||
device, ret := nvml.DeviceGetHandleByIndex(ID)
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
name, ret := device.GetName()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
uuid, ret := device.GetUUID()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
gpu := GPU{
|
||||
Name: name,
|
||||
UUID: uuid,
|
||||
Index: ID,
|
||||
}
|
||||
|
||||
return gpu, device, nil
|
||||
}
|
||||
Reference in New Issue
Block a user