add min and max temp, refactor nvidia api
This commit is contained in:
@@ -3,32 +3,42 @@ package nvidia
|
||||
import (
	"errors"
	"fmt"
	"log"
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)
|
||||
|
||||
type (
|
||||
Repository struct {
|
||||
driverVersion string
|
||||
cudaVersion string
|
||||
cache struct {
|
||||
gpus map[string]GPU
|
||||
mu sync.RWMutex
|
||||
}
|
||||
|
||||
GPUSummary struct {
|
||||
UUID string
|
||||
Name string
|
||||
}
|
||||
|
||||
GPU struct {
|
||||
Name string `json:"name"`
|
||||
UUID string `json:"uuid"`
|
||||
Index int `json:"-"`
|
||||
Name string `json:"name"`
|
||||
UUID string `json:"uuid"`
|
||||
Index int `json:"-"`
|
||||
Temperature Temperature `json:"temperature"`
|
||||
Utilization Utilization `json:"usage"`
|
||||
Processes []Process `json:"processes"`
|
||||
Memory Memory `json:"memory"`
|
||||
Fans []Fan `json:"fans"`
|
||||
}
|
||||
|
||||
GPUDetail struct {
|
||||
GPU
|
||||
CoreTemperature int `json:"coreTemperature"`
|
||||
Utilization Utilization `json:"usage"`
|
||||
Processes []Process `json:"processes"`
|
||||
Memory Memory `json:"memory"`
|
||||
Fans []Fan `json:"fans"`
|
||||
Temperature struct {
|
||||
Min uint
|
||||
Max uint
|
||||
Current uint
|
||||
}
|
||||
|
||||
Memory struct {
|
||||
@@ -57,9 +67,15 @@ type (
|
||||
}
|
||||
)
|
||||
|
||||
var instance *Repository
|
||||
var (
|
||||
ErrInitialization = errors.New("unable to initialize NVML")
|
||||
ErrDriverAPI = errors.New("an error occured while querying the driver")
|
||||
ErrNotFound = errors.New("the gpu is not found")
|
||||
|
||||
func (*Repository) Close() error {
|
||||
c *cache
|
||||
)
|
||||
|
||||
func Close() error {
|
||||
ret := nvml.Shutdown()
|
||||
if ret != nvml.SUCCESS {
|
||||
return errors.New(nvml.ErrorString(ret))
|
||||
@@ -67,60 +83,129 @@ func (*Repository) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func New() *Repository {
|
||||
if instance == nil {
|
||||
ret := nvml.Init()
|
||||
if ret != nvml.SUCCESS {
|
||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
||||
}
|
||||
driverVersion, ret := nvml.SystemGetDriverVersion()
|
||||
if ret != nvml.SUCCESS {
|
||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
||||
}
|
||||
cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
|
||||
if ret != nvml.SUCCESS {
|
||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
||||
}
|
||||
instance = &Repository{
|
||||
driverVersion: driverVersion,
|
||||
cudaVersion: parseCUDA(cudaVersion),
|
||||
}
|
||||
func RunDaemon() {
|
||||
c = &cache{
|
||||
gpus: make(map[string]GPU),
|
||||
}
|
||||
return instance
|
||||
ret := nvml.Init()
|
||||
if ret != nvml.SUCCESS {
|
||||
log.Println("[ERROR]", nvml.ErrorString(ret))
|
||||
return
|
||||
}
|
||||
go func() {
|
||||
for {
|
||||
time.Sleep(1 * time.Second)
|
||||
update()
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (*Repository) GetGPUs() ([]GPU, error) {
|
||||
func update() {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
count, ret := nvml.DeviceGetCount()
|
||||
if ret != nvml.SUCCESS {
|
||||
return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret))
|
||||
log.Println("[ERROR]", nvml.ErrorString(ret))
|
||||
return
|
||||
}
|
||||
gpus := make([]GPU, 0, count)
|
||||
for i := 0; i < count; i++ {
|
||||
gpu, _, err := getGPU(i)
|
||||
gpu, err := query(i)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
log.Println("[ERROR]", err)
|
||||
return
|
||||
}
|
||||
gpus = append(gpus, gpu)
|
||||
t := &gpu.Temperature
|
||||
if g, ok := c.gpus[gpu.UUID]; ok {
|
||||
if g.Temperature.Min > t.Current {
|
||||
t.Min = t.Current
|
||||
} else {
|
||||
t.Min = g.Temperature.Min
|
||||
}
|
||||
if g.Temperature.Max < t.Current {
|
||||
t.Max = t.Current
|
||||
} else {
|
||||
t.Max = g.Temperature.Max
|
||||
}
|
||||
} else {
|
||||
t.Max = t.Current
|
||||
t.Min = t.Current
|
||||
}
|
||||
c.gpus[gpu.UUID] = gpu
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
||||
gpu, device, err := getGPU(ID)
|
||||
if err != nil {
|
||||
return GPUDetail{}, err
|
||||
func GPUByUUID(uuid string) (GPU, error) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
gpu, ok := c.gpus[uuid]
|
||||
if !ok {
|
||||
return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
|
||||
}
|
||||
return gpu, nil
|
||||
}
|
||||
|
||||
func Summary() []GPUSummary {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
var res []GPUSummary
|
||||
for _, gpu := range c.gpus {
|
||||
res = append(res, GPUSummary{
|
||||
UUID: gpu.UUID,
|
||||
Name: gpu.Name,
|
||||
})
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func DriverVersion() (string, error) {
|
||||
driverVersion, ret := nvml.SystemGetDriverVersion()
|
||||
if ret != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
|
||||
}
|
||||
return driverVersion, nil
|
||||
}
|
||||
|
||||
func CUDAVersion() (string, error) {
|
||||
cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
|
||||
if ret != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
|
||||
}
|
||||
return parseCUDA(cudaVersion), nil
|
||||
}
|
||||
|
||||
// parseCUDA renders an integer CUDA version as "major.minor".
//
// The major part is version divided by 1000; the minor part is the remainder
// divided by 10. For example 12020 becomes "12.2".
func parseCUDA(version int) string {
	major := version / 1000
	minor := (version % 1000) / 10
	return fmt.Sprintf("%d.%d", major, minor)
}
|
||||
|
||||
func query(index int) (GPU, error) {
|
||||
device, ret := nvml.DeviceGetHandleByIndex(index)
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
name, ret := device.GetName()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
uuid, ret := device.GetUUID()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
fanCount, ret := device.GetNumFans()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
fans := make([]Fan, 0, fanCount)
|
||||
for i := 0; i < fanCount; i++ {
|
||||
fdev, ret := device.GetFanSpeed_v2(i)
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
fan := Fan{
|
||||
Speed: int(fdev),
|
||||
@@ -130,30 +215,30 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
||||
|
||||
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
load, ret := device.GetUtilizationRates()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
decUsage, _, ret := device.GetDecoderUtilization()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
encUsage, _, ret := device.GetEncoderUtilization()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
// Fetch all running process on the GPU
|
||||
var allProcess []Process
|
||||
allProcess := make([]Process, 0)
|
||||
|
||||
// Compute proc
|
||||
proc, ret := device.GetComputeRunningProcesses()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
for _, p := range proc {
|
||||
|
||||
@@ -169,7 +254,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
||||
// Graphics/3D procs
|
||||
proc, ret = device.GetGraphicsRunningProcesses()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
for _, p := range proc {
|
||||
sproc := Process{
|
||||
@@ -184,7 +269,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
||||
// MPS procs
|
||||
proc, ret = device.GetMPSComputeRunningProcesses()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
for _, p := range proc {
|
||||
sproc := Process{
|
||||
@@ -206,12 +291,15 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
||||
|
||||
mem, ret := device.GetMemoryInfo_v2()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
||||
return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
return GPUDetail{
|
||||
GPU: gpu,
|
||||
CoreTemperature: int(temp),
|
||||
return GPU{
|
||||
Name: name,
|
||||
UUID: uuid,
|
||||
Temperature: Temperature{
|
||||
Current: uint(temp),
|
||||
},
|
||||
Utilization: Utilization{
|
||||
Decoder: int(decUsage),
|
||||
Encoder: int(encUsage),
|
||||
@@ -228,43 +316,3 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
||||
Fans: fans,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (r *Repository) DriverVersion() string {
|
||||
return r.driverVersion
|
||||
}
|
||||
|
||||
func (r *Repository) CUDAVersion() string {
|
||||
return r.cudaVersion
|
||||
}
|
||||
|
||||
// parseCUDA renders an integer CUDA version as "major.minor": the major part
// is version divided by 1000 and the minor part is the remainder divided by
// 10 (11080 becomes "11.8").
func parseCUDA(version int) string {
	return fmt.Sprintf("%d.%d", version/1000, version%1000/10)
}
|
||||
|
||||
func getGPU(ID int) (GPU, nvml.Device, error) {
|
||||
device, ret := nvml.DeviceGetHandleByIndex(ID)
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
name, ret := device.GetName()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
uuid, ret := device.GetUUID()
|
||||
if ret != nvml.SUCCESS {
|
||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
||||
}
|
||||
|
||||
gpu := GPU{
|
||||
Name: name,
|
||||
UUID: uuid,
|
||||
Index: ID,
|
||||
}
|
||||
|
||||
return gpu, device, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user