commit de91d6f3e822736f143750372fbfa53b911ce5a2 Author: Aurélie Date: Fri Feb 23 20:24:27 2024 +0100 Start project diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..020bfcd --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "static"] + path = static + url = https://git.thelilfrog.com/thelilfrog/startbootstrap-sb-admin-2 diff --git a/api/api.go b/api/api.go new file mode 100644 index 0000000..10531f8 --- /dev/null +++ b/api/api.go @@ -0,0 +1,196 @@ +package api + +import ( + "fmt" + "html/template" + "log" + "net/http" + "nvidiadashboard/pkg/nvidia" + "os" + "path/filepath" + "strings" + + "github.com/go-chi/chi/v5" + "github.com/go-chi/chi/v5/middleware" + "github.com/inhies/go-bytesize" + "golang.org/x/net/http2" + "golang.org/x/net/http2/h2c" +) + +type ( + NVIDIARepository interface { + GetGPU(ID int) (nvidia.GPUDetail, error) + GetGPUs() ([]nvidia.GPU, error) + DriverVersion() string + CUDAVersion() string + } + + Server struct { + repo NVIDIARepository + mux *chi.Mux + errLog *log.Logger + } + + WebPack struct { + GPUs []nvidia.GPU + GPU nvidia.GPUDetail + Username string + DriverVersion string + CUDAVersion string + } +) + +var ( + templateFuncMap = template.FuncMap{ + "ConvertByteSize": convertByteSize, + "PercentageRounded": percentageRounded, + } +) + +const internalServerErrorPage string = ` + + + Internal Server Error + + +

Internal Server Error

+ nvidia-smi dashboard + +` + +func New(repo NVIDIARepository) *Server { + s := &Server{ + mux: chi.NewRouter(), + repo: repo, + errLog: log.New(os.Stderr, log.Prefix(), log.Flags()), + } + s.mux.Use(middleware.RequestID) + s.mux.Use(middleware.Logger) + s.mux.Use(middleware.Recoverer) + s.mux.Use(middleware.URLFormat) + + s.mux.Get("/", s.handleRoot) + s.mux.Get("/{uuid:GPU-(.*)}", s.handleGPU) + s.mux.Get("/{path:(.*).html}", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + w.Write([]byte("404 page not found")) + }) + workDir, _ := os.Getwd() + filesDir := http.Dir(filepath.Join(workDir, "static")) + fileServer(s.mux, "/", filesDir) + return s +} + +func (s *Server) Serve(port uint) error { + h2s := &http2.Server{} + srv := &http.Server{ + Addr: fmt.Sprintf("0.0.0.0:%v", port), + Handler: h2c.NewHandler(s.mux, h2s), // HTTP/2 Cleartext handler + } + return srv.ListenAndServe() +} + +func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) { + gpus, err := s.repo.GetGPUs() + if err != nil { + sendInternalServerErrorHTML(w, err) + return + } + if len(gpus) > 0 { + w.Header().Add("Location", "/"+gpus[0].UUID) + w.WriteHeader(http.StatusTemporaryRedirect) + return + } + wp := WebPack{ + Username: "anonymous", + } + + t, err := template.ParseFiles("static/no_gpu.html") + if err != nil { + sendInternalServerErrorHTML(w, err) + } + err = t.Execute(w, wp) + if err != nil { + s.errLog.Println("[ERROR]", err) + } +} + +func (s *Server) handleGPU(w http.ResponseWriter, r *http.Request) { + uuid := chi.URLParam(r, "uuid") + + gpus, err := s.repo.GetGPUs() + if err != nil { + sendInternalServerErrorHTML(w, err) + return + } + i := -1 + for _, gpu := range gpus { + if gpu.UUID == uuid { + i = gpu.Index + } + } + if i == -1 { + w.WriteHeader(http.StatusNotFound) + w.Write([]byte("404 page not found")) + } + gpu, err := s.repo.GetGPU(i) + if err != nil { + sendInternalServerErrorHTML(w, err) + return + } + wp := WebPack{ + Username: "anonymous", + GPUs: gpus, + GPU: gpu, + DriverVersion: s.repo.DriverVersion(), + CUDAVersion: s.repo.CUDAVersion(), + } + + t, err := template.New("index").Funcs(templateFuncMap).ParseFiles("static/index.html") + if err != nil { + sendInternalServerErrorHTML(w, err) + } + err = t.ExecuteTemplate(w, "index.html", wp) + if err != nil { + s.errLog.Println("[ERROR]", err) + } +} + +// FileServer conveniently sets up a http.FileServer handler to serve +// static files from a http.FileSystem. +func fileServer(r chi.Router, path string, root http.FileSystem) { + if strings.ContainsAny(path, "{}*") { + panic("FileServer does not permit any URL parameters.") + } + + if path != "/" && path[len(path)-1] != '/' { + r.Get(path, http.RedirectHandler(path+"/", http.StatusMovedPermanently).ServeHTTP) + path += "/" + } + path += "*" + + r.Get(path, func(w http.ResponseWriter, r *http.Request) { + rctx := chi.RouteContext(r.Context()) + pathPrefix := strings.TrimSuffix(rctx.RoutePattern(), "/*") + fs := http.StripPrefix(pathPrefix, http.FileServer(root)) + fs.ServeHTTP(w, r) + }) +} + +func sendInternalServerErrorHTML(w http.ResponseWriter, v any) { + w.WriteHeader(http.StatusInternalServerError) + body := fmt.Sprintf(internalServerErrorPage, v) + w.Write([]byte(body)) +} + +func convertByteSize(v int) string { + bs := bytesize.New(float64(v)) + return bs.String() +} + +func percentageRounded(a, b int) int { + p := (a / b) * 100 + return p +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..ac782e5 --- /dev/null +++ b/go.mod @@ -0,0 +1,14 @@ +module nvidiadashboard + +go 1.22 + +require ( + github.com/NVIDIA/go-nvml v0.12.0-2 + github.com/go-chi/chi/v5 v5.0.12 + golang.org/x/net v0.21.0 +) + +require ( + github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf // indirect + golang.org/x/text v0.14.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..dacc585 --- /dev/null +++ b/go.sum @@ -0,0 +1,26 @@ +github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY= +github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-chi/chi/v5 v5.0.12 h1:9euLV5sTrTNTRUU9POmDUvfxyj6LAABLUcEWO+JJb4s= +github.com/go-chi/chi/v5 v5.0.12/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8= +github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf h1:FtEj8sfIcaaBfAKrE1Cwb61YDtYq9JxChK1c7AKce7s= +github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf/go.mod h1:yrqSXGoD/4EKfF26AOGzscPOgTTJcyAwM2rpixWT+t4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go new file mode 100644 index 0000000..5c678c9 --- /dev/null +++ b/main.go @@ -0,0 +1,25 @@ +package main + +import ( + "log" + "nvidiadashboard/api" + "nvidiadashboard/pkg/nvidia" +) + +func main() { + r := nvidia.New() + defer r.Close() + + log.Println("[INFO] NVIDIA driver loaded:", r.DriverVersion()) + + gpus, _ := r.GetGPUs() + log.Printf("[INFO] %d NVIDIA GPUs found", len(gpus)) + + s := api.New(r) + + log.Printf("[INFO] Server listening at :3000") + err := s.Serve(3000) + if err != nil { + log.Fatal(err) + } +} diff --git a/pkg/nvidia/nvidia.go b/pkg/nvidia/nvidia.go new file mode 100644 index 0000000..7654901 --- /dev/null +++ b/pkg/nvidia/nvidia.go @@ -0,0 +1,268 @@ +package nvidia + +import ( + "errors" + "fmt" + "os" + "strconv" + + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +type ( + Repository struct { + driverVersion string + cudaVersion string + } + + GPU struct { + Name string + UUID string + Index int + } + + GPUDetail struct { + GPU + CoreTemperature int + Utilization Utilization + Processes []Process + Memory Memory + Fans []Fan + } + + Memory struct { + Free int + Reserved int + Total int + Used int + Version int + } + + Process struct { + PID int + MemoryUsed int + Name string + Type string + } + + Utilization struct { + Decode int + Encode int + Rate int + } + + Fan struct { + Speed int + } +) + +var instance *Repository + +func (*Repository) Close() error { + ret := nvml.Shutdown() + if ret != nvml.SUCCESS { + return errors.New(nvml.ErrorString(ret)) + } + return nil +} + +func New() *Repository { + if instance == nil { + ret := nvml.Init() + if ret != nvml.SUCCESS { + panic("unable to initialize NVML: " + nvml.ErrorString(ret)) + } + driverVersion, ret := nvml.SystemGetDriverVersion() + if ret != nvml.SUCCESS { + panic("unable to initialize NVML: " + nvml.ErrorString(ret)) + } + cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2() + if ret != nvml.SUCCESS { + panic("unable to initialize NVML: " + nvml.ErrorString(ret)) + } + instance = &Repository{ + driverVersion: driverVersion, + cudaVersion: parseCUDA(cudaVersion), + } + } + return instance +} + +func (*Repository) GetGPUs() ([]GPU, error) { + count, ret := nvml.DeviceGetCount() + if ret != nvml.SUCCESS { + return nil, errors.New("unable to get device count: " + nvml.ErrorString(ret)) + } + gpus := make([]GPU, 0, count) + for i := 0; i < count; i++ { + gpu, _, err := getGPU(i) + if err != nil { + return nil, err + } + gpus = append(gpus, gpu) + } + return gpus, nil +} + +func (*Repository) GetGPU(ID int) (GPUDetail, error) { + gpu, device, err := getGPU(ID) + if err != nil { + return GPUDetail{}, err + } + + fanCount, ret := device.GetNumFans() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + + fans := make([]Fan, 0, fanCount) + for i := 0; i < fanCount; i++ { + fdev, ret := device.GetFanSpeed_v2(i) + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + fan := Fan{ + Speed: int(fdev), + } + fans = append(fans, fan) + } + + temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU) + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + + load, ret := device.GetUtilizationRates() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + + decUsage, _, ret := device.GetDecoderUtilization() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + encUsage, _, ret := device.GetEncoderUtilization() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + + // Fetch all running process on the GPU + var allProcess []Process + + // Compute proc + proc, ret := device.GetComputeRunningProcesses() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + for _, p := range proc { + + sproc := Process{ + PID: int(p.Pid), + MemoryUsed: int(p.UsedGpuMemory), + Type: "Compute", + Name: "n/a", + } + allProcess = append(allProcess, sproc) + } + + // Graphics/3D procs + proc, ret = device.GetGraphicsRunningProcesses() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + for _, p := range proc { + sproc := Process{ + PID: int(p.Pid), + MemoryUsed: int(p.UsedGpuMemory), + Type: "Graphics", + Name: "n/a", + } + allProcess = append(allProcess, sproc) + } + + // MPS procs + proc, ret = device.GetMPSComputeRunningProcesses() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + for _, p := range proc { + sproc := Process{ + PID: int(p.Pid), + MemoryUsed: int(p.UsedGpuMemory), + Type: "MPSCompute", + Name: "n/a", + } + allProcess = append(allProcess, sproc) + } + + // Find process command line + for _, process := range allProcess { + if cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(process.PID) + "/cmdline"); err == nil { + process.Name = string(cmdline) + } + } + + mem, ret := device.GetMemoryInfo_v2() + if ret != nvml.SUCCESS { + return GPUDetail{}, errors.New(nvml.ErrorString(ret)) + } + + return GPUDetail{ + GPU: gpu, + CoreTemperature: int(temp), + Utilization: Utilization{ + Decode: int(decUsage), + Encode: int(encUsage), + Rate: int(load.Gpu), + }, + Processes: allProcess, + Memory: Memory{ + Free: int(mem.Free), + Reserved: int(mem.Reserved), + Total: int(mem.Total), + Used: int(mem.Used), + Version: int(mem.Version), + }, + Fans: fans, + }, nil +} + +func (r *Repository) DriverVersion() string { + return r.driverVersion +} + +func (r *Repository) CUDAVersion() string { + return r.cudaVersion +} + +func parseCUDA(version int) string { + major := (int)(version / 1000) + minor := (int)((version - (major * 1000)) / 10) + + return fmt.Sprintf("%d.%d", major, minor) +} + +func getGPU(ID int) (GPU, nvml.Device, error) { + device, ret := nvml.DeviceGetHandleByIndex(ID) + if ret != nvml.SUCCESS { + return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) + } + + name, ret := device.GetName() + if ret != nvml.SUCCESS { + return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) + } + + uuid, ret := device.GetUUID() + if ret != nvml.SUCCESS { + return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(ret)) + } + + gpu := GPU{ + Name: name, + UUID: uuid, + Index: ID, + } + + return gpu, device, nil +} diff --git a/static b/static new file mode 160000 index 0000000..de2d3dc --- /dev/null +++ b/static @@ -0,0 +1 @@ +Subproject commit de2d3dc85f6b20ff0884373250b6fd36a31f633a