add min and max temp, refactor nvidia api
This commit is contained in:
69
api/api.go
69
api/api.go
@@ -2,6 +2,7 @@ package api
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"html/template"
|
"html/template"
|
||||||
"log"
|
"log"
|
||||||
@@ -20,22 +21,14 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type (
|
type (
|
||||||
NVIDIARepository interface {
|
|
||||||
GetGPU(ID int) (nvidia.GPUDetail, error)
|
|
||||||
GetGPUs() ([]nvidia.GPU, error)
|
|
||||||
DriverVersion() string
|
|
||||||
CUDAVersion() string
|
|
||||||
}
|
|
||||||
|
|
||||||
Server struct {
|
Server struct {
|
||||||
repo NVIDIARepository
|
|
||||||
mux *chi.Mux
|
mux *chi.Mux
|
||||||
errLog *log.Logger
|
errLog *log.Logger
|
||||||
}
|
}
|
||||||
|
|
||||||
WebPack struct {
|
WebPack struct {
|
||||||
GPUs []nvidia.GPU
|
GPUs []nvidia.GPUSummary
|
||||||
GPU nvidia.GPUDetail
|
GPU nvidia.GPU
|
||||||
Username string
|
Username string
|
||||||
DriverVersion string
|
DriverVersion string
|
||||||
CUDAVersion string
|
CUDAVersion string
|
||||||
@@ -63,10 +56,9 @@ const internalServerErrorPage string = `<!DOCTYPE html>
|
|||||||
</body>
|
</body>
|
||||||
</html>`
|
</html>`
|
||||||
|
|
||||||
func New(repo NVIDIARepository) *Server {
|
func New() *Server {
|
||||||
s := &Server{
|
s := &Server{
|
||||||
mux: chi.NewRouter(),
|
mux: chi.NewRouter(),
|
||||||
repo: repo,
|
|
||||||
errLog: log.New(os.Stderr, log.Prefix(), log.Flags()),
|
errLog: log.New(os.Stderr, log.Prefix(), log.Flags()),
|
||||||
}
|
}
|
||||||
s.mux.Use(middleware.RequestID)
|
s.mux.Use(middleware.RequestID)
|
||||||
@@ -97,11 +89,7 @@ func (s *Server) Serve(port uint) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) {
|
func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) {
|
||||||
gpus, err := s.repo.GetGPUs()
|
gpus := nvidia.Summary()
|
||||||
if err != nil {
|
|
||||||
internalServerErrorHTML(w, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if len(gpus) > 0 {
|
if len(gpus) > 0 {
|
||||||
w.Header().Add("Location", "/"+gpus[0].UUID)
|
w.Header().Add("Location", "/"+gpus[0].UUID)
|
||||||
w.WriteHeader(http.StatusTemporaryRedirect)
|
w.WriteHeader(http.StatusTemporaryRedirect)
|
||||||
@@ -125,33 +113,32 @@ func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) {
|
|||||||
|
|
||||||
func (s *Server) handleGPU(w http.ResponseWriter, r *http.Request) {
|
func (s *Server) handleGPU(w http.ResponseWriter, r *http.Request) {
|
||||||
uuid := chi.URLParam(r, "uuid")
|
uuid := chi.URLParam(r, "uuid")
|
||||||
|
gpu, err := nvidia.GPUByUUID(uuid)
|
||||||
gpus, err := s.repo.GetGPUs()
|
if err != nil {
|
||||||
|
if errors.Is(err, nvidia.ErrNotFound) {
|
||||||
|
w.Header().Add("Location", "/")
|
||||||
|
w.WriteHeader(http.StatusTemporaryRedirect)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
internalServerErrorHTML(w, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
driverVersion, err := nvidia.DriverVersion()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
internalServerErrorHTML(w, err)
|
internalServerErrorHTML(w, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
i := -1
|
cudaVersion, err := nvidia.CUDAVersion()
|
||||||
for _, gpu := range gpus {
|
|
||||||
if gpu.UUID == uuid {
|
|
||||||
i = gpu.Index
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if i == -1 {
|
|
||||||
w.WriteHeader(http.StatusNotFound)
|
|
||||||
w.Write([]byte("404 page not found"))
|
|
||||||
}
|
|
||||||
gpu, err := s.repo.GetGPU(i)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
internalServerErrorHTML(w, err)
|
internalServerErrorHTML(w, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
wp := WebPack{
|
wp := WebPack{
|
||||||
Username: "anonymous",
|
Username: "anonymous",
|
||||||
GPUs: gpus,
|
GPUs: nvidia.Summary(),
|
||||||
GPU: gpu,
|
GPU: gpu,
|
||||||
DriverVersion: s.repo.DriverVersion(),
|
DriverVersion: driverVersion,
|
||||||
CUDAVersion: s.repo.CUDAVersion(),
|
CUDAVersion: cudaVersion,
|
||||||
Version: constant.Version,
|
Version: constant.Version,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -168,24 +155,12 @@ func (s *Server) handleGPU(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
func (s *Server) handleGPUJSON(w http.ResponseWriter, r *http.Request) {
|
func (s *Server) handleGPUJSON(w http.ResponseWriter, r *http.Request) {
|
||||||
uuid := chi.URLParam(r, "uuid")
|
uuid := chi.URLParam(r, "uuid")
|
||||||
|
gpu, err := nvidia.GPUByUUID(uuid)
|
||||||
gpus, err := s.repo.GetGPUs()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
internalServerErrorJSON(w, err)
|
if errors.Is(err, nvidia.ErrNotFound) {
|
||||||
return
|
|
||||||
}
|
|
||||||
i := -1
|
|
||||||
for _, gpu := range gpus {
|
|
||||||
if gpu.UUID == uuid {
|
|
||||||
i = gpu.Index
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if i == -1 {
|
|
||||||
notFoundJSON(w)
|
notFoundJSON(w)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
gpu, err := s.repo.GetGPU(i)
|
|
||||||
if err != nil {
|
|
||||||
internalServerErrorJSON(w, err)
|
internalServerErrorJSON(w, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
10
go.mod
10
go.mod
@@ -3,12 +3,10 @@ module nvidiadashboard
|
|||||||
go 1.22
|
go 1.22
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/NVIDIA/go-nvml v0.12.0-2
|
github.com/NVIDIA/go-nvml v0.12.0-4
|
||||||
github.com/go-chi/chi/v5 v5.0.12
|
github.com/go-chi/chi/v5 v5.0.12
|
||||||
golang.org/x/net v0.21.0
|
github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf
|
||||||
|
golang.org/x/net v0.24.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require golang.org/x/text v0.14.0 // indirect
|
||||||
github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf // indirect
|
|
||||||
golang.org/x/text v0.14.0 // indirect
|
|
||||||
)
|
|
||||||
|
|||||||
20
go.sum
20
go.sum
@@ -1,6 +1,5 @@
|
|||||||
github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY=
|
github.com/NVIDIA/go-nvml v0.12.0-4 h1:BvPjnjJr6qje0zov57Md7TwEA8i/12kZeUQIpyWzTEE=
|
||||||
github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0=
|
github.com/NVIDIA/go-nvml v0.12.0-4/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
|
||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/go-chi/chi/v5 v5.0.12 h1:9euLV5sTrTNTRUU9POmDUvfxyj6LAABLUcEWO+JJb4s=
|
github.com/go-chi/chi/v5 v5.0.12 h1:9euLV5sTrTNTRUU9POmDUvfxyj6LAABLUcEWO+JJb4s=
|
||||||
@@ -9,18 +8,11 @@ github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf h1:FtEj8sfIcaaB
|
|||||||
github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf/go.mod h1:yrqSXGoD/4EKfF26AOGzscPOgTTJcyAwM2rpixWT+t4=
|
github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf/go.mod h1:yrqSXGoD/4EKfF26AOGzscPOgTTJcyAwM2rpixWT+t4=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
|
||||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
|
||||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
|
||||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
|
||||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
|
||||||
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
|
|
||||||
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
|
|
||||||
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
|
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
|
||||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
|
||||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||
18
main.go
18
main.go
@@ -1,6 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"nvidiadashboard/api"
|
"nvidiadashboard/api"
|
||||||
@@ -9,20 +10,19 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
var port int
|
||||||
|
flag.IntVar(&port, "port", 3000, "Port of the web server")
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
fmt.Println("*** NVIDIA Web Dashboard -", constant.Version, "***")
|
fmt.Println("*** NVIDIA Web Dashboard -", constant.Version, "***")
|
||||||
|
|
||||||
r := nvidia.New()
|
nvidia.RunDaemon()
|
||||||
defer r.Close()
|
defer nvidia.Close()
|
||||||
|
|
||||||
log.Println("[INFO] NVIDIA driver loaded:", r.DriverVersion())
|
s := api.New()
|
||||||
|
|
||||||
gpus, _ := r.GetGPUs()
|
|
||||||
log.Printf("[INFO] %d NVIDIA GPUs found", len(gpus))
|
|
||||||
|
|
||||||
s := api.New(r)
|
|
||||||
|
|
||||||
log.Printf("[INFO] Server listening at :3000")
|
log.Printf("[INFO] Server listening at :3000")
|
||||||
err := s.Serve(3000)
|
err := s.Serve(uint(port))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
package constant
|
package constant
|
||||||
|
|
||||||
const (
|
const (
|
||||||
Version = "0.1-alpha"
|
Version = "0.2-alpha"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,34 +3,44 @@ package nvidia
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
)
|
)
|
||||||
|
|
||||||
type (
|
type (
|
||||||
Repository struct {
|
cache struct {
|
||||||
driverVersion string
|
gpus map[string]GPU
|
||||||
cudaVersion string
|
mu sync.RWMutex
|
||||||
|
}
|
||||||
|
|
||||||
|
GPUSummary struct {
|
||||||
|
UUID string
|
||||||
|
Name string
|
||||||
}
|
}
|
||||||
|
|
||||||
GPU struct {
|
GPU struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
UUID string `json:"uuid"`
|
UUID string `json:"uuid"`
|
||||||
Index int `json:"-"`
|
Index int `json:"-"`
|
||||||
}
|
Temperature Temperature `json:"temperature"`
|
||||||
|
|
||||||
GPUDetail struct {
|
|
||||||
GPU
|
|
||||||
CoreTemperature int `json:"coreTemperature"`
|
|
||||||
Utilization Utilization `json:"usage"`
|
Utilization Utilization `json:"usage"`
|
||||||
Processes []Process `json:"processes"`
|
Processes []Process `json:"processes"`
|
||||||
Memory Memory `json:"memory"`
|
Memory Memory `json:"memory"`
|
||||||
Fans []Fan `json:"fans"`
|
Fans []Fan `json:"fans"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Temperature struct {
|
||||||
|
Min uint
|
||||||
|
Max uint
|
||||||
|
Current uint
|
||||||
|
}
|
||||||
|
|
||||||
Memory struct {
|
Memory struct {
|
||||||
Free int `json:"free"`
|
Free int `json:"free"`
|
||||||
Reserved int `json:"reserved"`
|
Reserved int `json:"reserved"`
|
||||||
@@ -57,9 +67,15 @@ type (
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
var instance *Repository
|
var (
|
||||||
|
ErrInitialization = errors.New("unable to initialize NVML")
|
||||||
|
ErrDriverAPI = errors.New("an error occured while querying the driver")
|
||||||
|
ErrNotFound = errors.New("the gpu is not found")
|
||||||
|
|
||||||
func (*Repository) Close() error {
|
c *cache
|
||||||
|
)
|
||||||
|
|
||||||
|
func Close() error {
|
||||||
ret := nvml.Shutdown()
|
ret := nvml.Shutdown()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return errors.New(nvml.ErrorString(ret))
|
return errors.New(nvml.ErrorString(ret))
|
||||||
@@ -67,60 +83,129 @@ func (*Repository) Close() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func New() *Repository {
|
func RunDaemon() {
|
||||||
if instance == nil {
|
c = &cache{
|
||||||
|
gpus: make(map[string]GPU),
|
||||||
|
}
|
||||||
ret := nvml.Init()
|
ret := nvml.Init()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
log.Println("[ERROR]", nvml.ErrorString(ret))
|
||||||
|
return
|
||||||
}
|
}
|
||||||
driverVersion, ret := nvml.SystemGetDriverVersion()
|
go func() {
|
||||||
if ret != nvml.SUCCESS {
|
for {
|
||||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
time.Sleep(1 * time.Second)
|
||||||
|
update()
|
||||||
}
|
}
|
||||||
cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
|
}()
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
panic("unable to initialize NVML: " + nvml.ErrorString(ret))
|
|
||||||
}
|
|
||||||
instance = &Repository{
|
|
||||||
driverVersion: driverVersion,
|
|
||||||
cudaVersion: parseCUDA(cudaVersion),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return instance
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (*Repository) GetGPUs() ([]GPU, error) {
|
func update() {
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
count, ret := nvml.DeviceGetCount()
|
count, ret := nvml.DeviceGetCount()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret))
|
log.Println("[ERROR]", nvml.ErrorString(ret))
|
||||||
|
return
|
||||||
}
|
}
|
||||||
gpus := make([]GPU, 0, count)
|
|
||||||
for i := 0; i < count; i++ {
|
for i := 0; i < count; i++ {
|
||||||
gpu, _, err := getGPU(i)
|
gpu, err := query(i)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
log.Println("[ERROR]", err)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
gpus = append(gpus, gpu)
|
t := &gpu.Temperature
|
||||||
|
if g, ok := c.gpus[gpu.UUID]; ok {
|
||||||
|
if g.Temperature.Min > t.Current {
|
||||||
|
t.Min = t.Current
|
||||||
|
} else {
|
||||||
|
t.Min = g.Temperature.Min
|
||||||
|
}
|
||||||
|
if g.Temperature.Max < t.Current {
|
||||||
|
t.Max = t.Current
|
||||||
|
} else {
|
||||||
|
t.Max = g.Temperature.Max
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
t.Max = t.Current
|
||||||
|
t.Min = t.Current
|
||||||
|
}
|
||||||
|
c.gpus[gpu.UUID] = gpu
|
||||||
}
|
}
|
||||||
return gpus, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
func GPUByUUID(uuid string) (GPU, error) {
|
||||||
gpu, device, err := getGPU(ID)
|
c.mu.RLock()
|
||||||
if err != nil {
|
defer c.mu.RUnlock()
|
||||||
return GPUDetail{}, err
|
gpu, ok := c.gpus[uuid]
|
||||||
|
if !ok {
|
||||||
|
return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid)
|
||||||
|
}
|
||||||
|
return gpu, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func Summary() []GPUSummary {
|
||||||
|
c.mu.RLock()
|
||||||
|
defer c.mu.RUnlock()
|
||||||
|
var res []GPUSummary
|
||||||
|
for _, gpu := range c.gpus {
|
||||||
|
res = append(res, GPUSummary{
|
||||||
|
UUID: gpu.UUID,
|
||||||
|
Name: gpu.Name,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func DriverVersion() (string, error) {
|
||||||
|
driverVersion, ret := nvml.SystemGetDriverVersion()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
|
||||||
|
}
|
||||||
|
return driverVersion, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func CUDAVersion() (string, error) {
|
||||||
|
cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret))
|
||||||
|
}
|
||||||
|
return parseCUDA(cudaVersion), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseCUDA(version int) string {
|
||||||
|
major := (int)(version / 1000)
|
||||||
|
minor := (int)((version - (major * 1000)) / 10)
|
||||||
|
|
||||||
|
return fmt.Sprintf("%d.%d", major, minor)
|
||||||
|
}
|
||||||
|
|
||||||
|
func query(index int) (GPU, error) {
|
||||||
|
device, ret := nvml.DeviceGetHandleByIndex(index)
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
|
}
|
||||||
|
|
||||||
|
name, ret := device.GetName()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
|
}
|
||||||
|
|
||||||
|
uuid, ret := device.GetUUID()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
|
|
||||||
fanCount, ret := device.GetNumFans()
|
fanCount, ret := device.GetNumFans()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
|
|
||||||
fans := make([]Fan, 0, fanCount)
|
fans := make([]Fan, 0, fanCount)
|
||||||
for i := 0; i < fanCount; i++ {
|
for i := 0; i < fanCount; i++ {
|
||||||
fdev, ret := device.GetFanSpeed_v2(i)
|
fdev, ret := device.GetFanSpeed_v2(i)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
fan := Fan{
|
fan := Fan{
|
||||||
Speed: int(fdev),
|
Speed: int(fdev),
|
||||||
@@ -130,30 +215,30 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
|||||||
|
|
||||||
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
|
temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
|
|
||||||
load, ret := device.GetUtilizationRates()
|
load, ret := device.GetUtilizationRates()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
|
|
||||||
decUsage, _, ret := device.GetDecoderUtilization()
|
decUsage, _, ret := device.GetDecoderUtilization()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
encUsage, _, ret := device.GetEncoderUtilization()
|
encUsage, _, ret := device.GetEncoderUtilization()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fetch all running process on the GPU
|
// Fetch all running process on the GPU
|
||||||
var allProcess []Process
|
allProcess := make([]Process, 0)
|
||||||
|
|
||||||
// Compute proc
|
// Compute proc
|
||||||
proc, ret := device.GetComputeRunningProcesses()
|
proc, ret := device.GetComputeRunningProcesses()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
for _, p := range proc {
|
for _, p := range proc {
|
||||||
|
|
||||||
@@ -169,7 +254,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
|||||||
// Graphics/3D procs
|
// Graphics/3D procs
|
||||||
proc, ret = device.GetGraphicsRunningProcesses()
|
proc, ret = device.GetGraphicsRunningProcesses()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
for _, p := range proc {
|
for _, p := range proc {
|
||||||
sproc := Process{
|
sproc := Process{
|
||||||
@@ -184,7 +269,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
|||||||
// MPS procs
|
// MPS procs
|
||||||
proc, ret = device.GetMPSComputeRunningProcesses()
|
proc, ret = device.GetMPSComputeRunningProcesses()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
for _, p := range proc {
|
for _, p := range proc {
|
||||||
sproc := Process{
|
sproc := Process{
|
||||||
@@ -206,12 +291,15 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
|||||||
|
|
||||||
mem, ret := device.GetMemoryInfo_v2()
|
mem, ret := device.GetMemoryInfo_v2()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
return GPUDetail{}, errors.New(nvml.ErrorString(ret))
|
return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret))
|
||||||
}
|
}
|
||||||
|
|
||||||
return GPUDetail{
|
return GPU{
|
||||||
GPU: gpu,
|
Name: name,
|
||||||
CoreTemperature: int(temp),
|
UUID: uuid,
|
||||||
|
Temperature: Temperature{
|
||||||
|
Current: uint(temp),
|
||||||
|
},
|
||||||
Utilization: Utilization{
|
Utilization: Utilization{
|
||||||
Decoder: int(decUsage),
|
Decoder: int(decUsage),
|
||||||
Encoder: int(encUsage),
|
Encoder: int(encUsage),
|
||||||
@@ -228,43 +316,3 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) {
|
|||||||
Fans: fans,
|
Fans: fans,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *Repository) DriverVersion() string {
|
|
||||||
return r.driverVersion
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Repository) CUDAVersion() string {
|
|
||||||
return r.cudaVersion
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseCUDA(version int) string {
|
|
||||||
major := (int)(version / 1000)
|
|
||||||
minor := (int)((version - (major * 1000)) / 10)
|
|
||||||
|
|
||||||
return fmt.Sprintf("%d.%d", major, minor)
|
|
||||||
}
|
|
||||||
|
|
||||||
func getGPU(ID int) (GPU, nvml.Device, error) {
|
|
||||||
device, ret := nvml.DeviceGetHandleByIndex(ID)
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
|
||||||
}
|
|
||||||
|
|
||||||
name, ret := device.GetName()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
|
||||||
}
|
|
||||||
|
|
||||||
uuid, ret := device.GetUUID()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret))
|
|
||||||
}
|
|
||||||
|
|
||||||
gpu := GPU{
|
|
||||||
Name: name,
|
|
||||||
UUID: uuid,
|
|
||||||
Index: ID,
|
|
||||||
}
|
|
||||||
|
|
||||||
return gpu, device, nil
|
|
||||||
}
|
|
||||||
|
|||||||
2
static
2
static
Submodule static updated: 369059c925...e75644b851
Reference in New Issue
Block a user