diff --git a/api/api.go b/api/api.go index e02d818..a72ba21 100644 --- a/api/api.go +++ b/api/api.go @@ -2,6 +2,7 @@ package api import ( "encoding/json" + "errors" "fmt" "html/template" "log" @@ -20,22 +21,14 @@ import ( ) type ( - NVIDIARepository interface { - GetGPU(ID int) (nvidia.GPUDetail, error) - GetGPUs() ([]nvidia.GPU, error) - DriverVersion() string - CUDAVersion() string - } - Server struct { - repo NVIDIARepository mux *chi.Mux errLog *log.Logger } WebPack struct { - GPUs []nvidia.GPU - GPU nvidia.GPUDetail + GPUs []nvidia.GPUSummary + GPU nvidia.GPU Username string DriverVersion string CUDAVersion string @@ -63,10 +56,9 @@ const internalServerErrorPage string = ` ` -func New(repo NVIDIARepository) *Server { +func New() *Server { s := &Server{ mux: chi.NewRouter(), - repo: repo, errLog: log.New(os.Stderr, log.Prefix(), log.Flags()), } s.mux.Use(middleware.RequestID) @@ -97,11 +89,7 @@ func (s *Server) Serve(port uint) error { } func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) { - gpus, err := s.repo.GetGPUs() - if err != nil { - internalServerErrorHTML(w, err) - return - } + gpus := nvidia.Summary() if len(gpus) > 0 { w.Header().Add("Location", "/"+gpus[0].UUID) w.WriteHeader(http.StatusTemporaryRedirect) @@ -125,33 +113,32 @@ func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) { func (s *Server) handleGPU(w http.ResponseWriter, r *http.Request) { uuid := chi.URLParam(r, "uuid") - - gpus, err := s.repo.GetGPUs() + gpu, err := nvidia.GPUByUUID(uuid) + if err != nil { + if errors.Is(err, nvidia.ErrNotFound) { + w.Header().Add("Location", "/") + w.WriteHeader(http.StatusTemporaryRedirect) + return + } + internalServerErrorHTML(w, err) + return + } + driverVersion, err := nvidia.DriverVersion() if err != nil { internalServerErrorHTML(w, err) return } - i := -1 - for _, gpu := range gpus { - if gpu.UUID == uuid { - i = gpu.Index - } - } - if i == -1 { - w.WriteHeader(http.StatusNotFound) - w.Write([]byte("404 page not found")) - } - gpu, err := s.repo.GetGPU(i) + cudaVersion, err := nvidia.CUDAVersion() if err != nil { internalServerErrorHTML(w, err) return } wp := WebPack{ Username: "anonymous", - GPUs: gpus, + GPUs: nvidia.Summary(), GPU: gpu, - DriverVersion: s.repo.DriverVersion(), - CUDAVersion: s.repo.CUDAVersion(), + DriverVersion: driverVersion, + CUDAVersion: cudaVersion, Version: constant.Version, } @@ -168,24 +155,12 @@ func (s *Server) handleGPU(w http.ResponseWriter, r *http.Request) { func (s *Server) handleGPUJSON(w http.ResponseWriter, r *http.Request) { uuid := chi.URLParam(r, "uuid") - - gpus, err := s.repo.GetGPUs() + gpu, err := nvidia.GPUByUUID(uuid) if err != nil { - internalServerErrorJSON(w, err) - return - } - i := -1 - for _, gpu := range gpus { - if gpu.UUID == uuid { - i = gpu.Index + if errors.Is(err, nvidia.ErrNotFound) { + notFoundJSON(w) + return } - } - if i == -1 { - notFoundJSON(w) - return - } - gpu, err := s.repo.GetGPU(i) - if err != nil { internalServerErrorJSON(w, err) return } diff --git a/go.mod b/go.mod index ac782e5..4dce024 100644 --- a/go.mod +++ b/go.mod @@ -3,12 +3,10 @@ module nvidiadashboard go 1.22 require ( - github.com/NVIDIA/go-nvml v0.12.0-2 + github.com/NVIDIA/go-nvml v0.12.0-4 github.com/go-chi/chi/v5 v5.0.12 - golang.org/x/net v0.21.0 + github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf + golang.org/x/net v0.24.0 ) -require ( - github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf // indirect - golang.org/x/text v0.14.0 // indirect -) +require golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index dacc585..c53835a 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,5 @@ -github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY= -github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/NVIDIA/go-nvml v0.12.0-4 h1:BvPjnjJr6qje0zov57Md7TwEA8i/12kZeUQIpyWzTEE= +github.com/NVIDIA/go-nvml v0.12.0-4/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-chi/chi/v5 v5.0.12 h1:9euLV5sTrTNTRUU9POmDUvfxyj6LAABLUcEWO+JJb4s= @@ -9,18 +8,11 @@ github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf h1:FtEj8sfIcaaB github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf/go.mod h1:yrqSXGoD/4EKfF26AOGzscPOgTTJcyAwM2rpixWT+t4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= +golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go index ba027bb..c0aef21 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,7 @@ package main import ( + "flag" "fmt" "log" "nvidiadashboard/api" @@ -9,20 +10,19 @@ import ( ) func main() { + var port int + flag.IntVar(&port, "port", 3000, "Port of the web server") + flag.Parse() + fmt.Println("*** NVIDIA Web Dashboard -", constant.Version, "***") - r := nvidia.New() - defer r.Close() + nvidia.RunDaemon() + defer nvidia.Close() - log.Println("[INFO] NVIDIA driver loaded:", r.DriverVersion()) - - gpus, _ := r.GetGPUs() - log.Printf("[INFO] %d NVIDIA GPUs found", len(gpus)) - - s := api.New(r) + s := api.New() log.Printf("[INFO] Server listening at :3000") - err := s.Serve(3000) + err := s.Serve(uint(port)) if err != nil { log.Fatal(err) } diff --git a/pkg/constant/constant.go b/pkg/constant/constant.go index 5e27c22..811c772 100644 --- a/pkg/constant/constant.go +++ b/pkg/constant/constant.go @@ -1,5 +1,5 @@ package constant const ( - Version = "0.1-alpha" + Version = "0.2-alpha" ) diff --git a/pkg/nvidia/nvidia.go b/pkg/nvidia/nvidia.go index e6442eb..bc53b42 100644 --- a/pkg/nvidia/nvidia.go +++ b/pkg/nvidia/nvidia.go @@ -3,32 +3,42 @@ package nvidia import ( "errors" "fmt" + "log" "os" "strconv" "strings" + "sync" + "time" "github.com/NVIDIA/go-nvml/pkg/nvml" ) type ( - Repository struct { - driverVersion string - cudaVersion string + cache struct { + gpus map[string]GPU + mu sync.RWMutex + } + + GPUSummary struct { + UUID string + Name string } GPU struct { - Name string `json:"name"` - UUID string `json:"uuid"` - Index int `json:"-"` + Name string `json:"name"` + UUID string `json:"uuid"` + Index int `json:"-"` + Temperature Temperature `json:"temperature"` + Utilization Utilization `json:"usage"` + Processes []Process `json:"processes"` + Memory Memory `json:"memory"` + Fans []Fan `json:"fans"` } - GPUDetail struct { - GPU - CoreTemperature int `json:"coreTemperature"` - Utilization Utilization `json:"usage"` - Processes []Process `json:"processes"` - Memory Memory `json:"memory"` - Fans []Fan `json:"fans"` + Temperature struct { + Min uint + Max uint + Current uint } Memory struct { @@ -57,9 +67,15 @@ type ( } ) -var instance *Repository +var ( + ErrInitialization = errors.New("unable to initialize NVML") + ErrDriverAPI = errors.New("an error occured while querying the driver") + ErrNotFound = errors.New("the gpu is not found") -func (*Repository) Close() error { + c *cache +) + +func Close() error { ret := nvml.Shutdown() if ret != nvml.SUCCESS { return errors.New(nvml.ErrorString(ret)) @@ -67,60 +83,129 @@ func (*Repository) Close() error { return nil } -func New() *Repository { - if instance == nil { - ret := nvml.Init() - if ret != nvml.SUCCESS { - panic("unable to initialize NVML: " + nvml.ErrorString(ret)) - } - driverVersion, ret := nvml.SystemGetDriverVersion() - if ret != nvml.SUCCESS { - panic("unable to initialize NVML: " + nvml.ErrorString(ret)) - } - cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2() - if ret != nvml.SUCCESS { - panic("unable to initialize NVML: " + nvml.ErrorString(ret)) - } - instance = &Repository{ - driverVersion: driverVersion, - cudaVersion: parseCUDA(cudaVersion), - } +func RunDaemon() { + c = &cache{ + gpus: make(map[string]GPU), } - return instance + ret := nvml.Init() + if ret != nvml.SUCCESS { + log.Println("[ERROR]", nvml.ErrorString(ret)) + return + } + go func() { + for { + time.Sleep(1 * time.Second) + update() + } + }() } -func (*Repository) GetGPUs() ([]GPU, error) { +func update() { + c.mu.Lock() + defer c.mu.Unlock() count, ret := nvml.DeviceGetCount() if ret != nvml.SUCCESS { - return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret)) + log.Println("[ERROR]", nvml.ErrorString(ret)) + return } - gpus := make([]GPU, 0, count) for i := 0; i < count; i++ { - gpu, _, err := getGPU(i) + gpu, err := query(i) if err != nil { - return nil, err + log.Println("[ERROR]", err) + return } - gpus = append(gpus, gpu) + t := &gpu.Temperature + if g, ok := c.gpus[gpu.UUID]; ok { + if g.Temperature.Min > t.Current { + t.Min = t.Current + } else { + t.Min = g.Temperature.Min + } + if g.Temperature.Max < t.Current { + t.Max = t.Current + } else { + t.Max = g.Temperature.Max + } + } else { + t.Max = t.Current + t.Min = t.Current + } + c.gpus[gpu.UUID] = gpu } - return gpus, nil } -func (*Repository) GetGPU(ID int) (GPUDetail, error) { - gpu, device, err := getGPU(ID) - if err != nil { - return GPUDetail{}, err +func GPUByUUID(uuid string) (GPU, error) { + c.mu.RLock() + defer c.mu.RUnlock() + gpu, ok := c.gpus[uuid] + if !ok { + return GPU{}, fmt.Errorf("%w: %s", ErrNotFound, uuid) + } + return gpu, nil +} + +func Summary() []GPUSummary { + c.mu.RLock() + defer c.mu.RUnlock() + var res []GPUSummary + for _, gpu := range c.gpus { + res = append(res, GPUSummary{ + UUID: gpu.UUID, + Name: gpu.Name, + }) + } + return res +} + +func DriverVersion() (string, error) { + driverVersion, ret := nvml.SystemGetDriverVersion() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret)) + } + return driverVersion, nil +} + +func CUDAVersion() (string, error) { + cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("%w: %s", ErrInitialization, nvml.ErrorString(ret)) + } + return parseCUDA(cudaVersion), nil +} + +func parseCUDA(version int) string { + major := (int)(version / 1000) + minor := (int)((version - (major * 1000)) / 10) + + return fmt.Sprintf("%d.%d", major, minor) +} + +func query(index int) (GPU, error) { + device, ret := nvml.DeviceGetHandleByIndex(index) + if ret != nvml.SUCCESS { + return GPU{}, fmt.Errorf("%w: DeviceGetHandleByIndex: %s", ErrDriverAPI, nvml.ErrorString(ret)) + } + + name, ret := device.GetName() + if ret != nvml.SUCCESS { + return GPU{}, fmt.Errorf("%w: GetName: %s", ErrDriverAPI, nvml.ErrorString(ret)) + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + return GPU{}, fmt.Errorf("%w: GetUUID: %s", ErrDriverAPI, nvml.ErrorString(ret)) } fanCount, ret := device.GetNumFans() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetNumFans: %s", ErrDriverAPI, nvml.ErrorString(ret)) } fans := make([]Fan, 0, fanCount) for i := 0; i < fanCount; i++ { fdev, ret := device.GetFanSpeed_v2(i) if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetFanSpeed_v2: %s", ErrDriverAPI, nvml.ErrorString(ret)) } fan := Fan{ Speed: int(fdev), @@ -130,30 +215,30 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) { temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU) if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetTemperature: %s", ErrDriverAPI, nvml.ErrorString(ret)) } load, ret := device.GetUtilizationRates() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetUtilizationRates: %s", ErrDriverAPI, nvml.ErrorString(ret)) } decUsage, _, ret := device.GetDecoderUtilization() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetDecoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret)) } encUsage, _, ret := device.GetEncoderUtilization() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetEncoderUtilization: %s", ErrDriverAPI, nvml.ErrorString(ret)) } // Fetch all running process on the GPU - var allProcess []Process + allProcess := make([]Process, 0) // Compute proc proc, ret := device.GetComputeRunningProcesses() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret)) } for _, p := range proc { @@ -169,7 +254,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) { // Graphics/3D procs proc, ret = device.GetGraphicsRunningProcesses() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetGraphicsRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret)) } for _, p := range proc { sproc := Process{ @@ -184,7 +269,7 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) { // MPS procs proc, ret = device.GetMPSComputeRunningProcesses() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetMPSComputeRunningProcesses: %s", ErrDriverAPI, nvml.ErrorString(ret)) } for _, p := range proc { sproc := Process{ @@ -206,12 +291,15 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) { mem, ret := device.GetMemoryInfo_v2() if ret != nvml.SUCCESS { - return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + return GPU{}, fmt.Errorf("%w: GetMemoryInfo_v2: %s", ErrDriverAPI, nvml.ErrorString(ret)) } - return GPUDetail{ - GPU: gpu, - CoreTemperature: int(temp), + return GPU{ + Name: name, + UUID: uuid, + Temperature: Temperature{ + Current: uint(temp), + }, Utilization: Utilization{ Decoder: int(decUsage), Encoder: int(encUsage), @@ -228,43 +316,3 @@ func (*Repository) GetGPU(ID int) (GPUDetail, error) { Fans: fans, }, nil } - -func (r *Repository) DriverVersion() string { - return r.driverVersion -} - -func (r *Repository) CUDAVersion() string { - return r.cudaVersion -} - -func parseCUDA(version int) string { - major := (int)(version / 1000) - minor := (int)((version - (major * 1000)) / 10) - - return fmt.Sprintf("%d.%d", major, minor) -} - -func getGPU(ID int) (GPU, nvml.Device, error) { - device, ret := nvml.DeviceGetHandleByIndex(ID) - if ret != nvml.SUCCESS { - return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) - } - - name, ret := device.GetName() - if ret != nvml.SUCCESS { - return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) - } - - uuid, ret := device.GetUUID() - if ret != nvml.SUCCESS { - return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) - } - - gpu := GPU{ - Name: name, - UUID: uuid, - Index: ID, - } - - return gpu, device, nil -} diff --git a/static b/static index 369059c..e75644b 160000 --- a/static +++ b/static @@ -1 +1 @@ -Subproject commit 369059c925560f0d46f21cc51336eed419fb4ab7 +Subproject commit e75644b851bf43d084a3ace3e281b11326805c24