Start project

This commit is contained in:
2024-02-23 20:24:27 +01:00
commit de91d6f3e8
7 changed files with 533 additions and 0 deletions

3
.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "static"]
path = static
url = https://git.thelilfrog.com/thelilfrog/startbootstrap-sb-admin-2

196
api/api.go Normal file
View File

@@ -0,0 +1,196 @@
package api
import (
"fmt"
"html/template"
"log"
"net/http"
"nvidiadashboard/pkg/nvidia"
"os"
"path/filepath"
"strings"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
"github.com/inhies/go-bytesize"
"golang.org/x/net/http2"
"golang.org/x/net/http2/h2c"
)
// NVIDIARepository is the GPU data source the HTTP handlers rely on.
type NVIDIARepository interface {
	// GetGPU returns the detailed state of the GPU with the given ID
	// (the handlers pass the GPU's Index here).
	GetGPU(ID int) (nvidia.GPUDetail, error)
	// GetGPUs lists the GPUs known to the repository.
	GetGPUs() ([]nvidia.GPU, error)
	// DriverVersion returns the driver version string.
	DriverVersion() string
	// CUDAVersion returns the CUDA version string.
	CUDAVersion() string
}

// Server holds the router, the GPU repository and the error logger used by
// the HTTP handlers.
type Server struct {
	repo   NVIDIARepository
	mux    *chi.Mux
	errLog *log.Logger
}

// WebPack is the value handed to the HTML templates when a page is rendered.
type WebPack struct {
	GPUs          []nvidia.GPU
	GPU           nvidia.GPUDetail
	Username      string
	DriverVersion string
	CUDAVersion   string
}
var (
templateFuncMap = template.FuncMap{
"ConvertByteSize": convertByteSize,
"PercentageRounded": percentageRounded,
}
)
// internalServerErrorPage is the HTML body sent with a 500 response. It is a
// fmt format string: the single %v verb is replaced by the error value passed
// to sendInternalServerErrorHTML.
const internalServerErrorPage string = `<!DOCTYPE html>
<html lang="en">
<head>
<title>Internal Server Error</title>
</head>
<body>
<h1>Internal Server Error</h1>
<p>%v</p>
<hr/>
<small>nvidia-smi dashboard</small>
</body>
</html>`
// New creates a Server that reads GPU data from repo. It installs the chi
// middleware stack, registers the dashboard routes and mounts the "static"
// directory (resolved against the current working directory) at "/".
func New(repo NVIDIARepository) *Server {
	router := chi.NewRouter()
	srv := &Server{
		repo:   repo,
		mux:    router,
		errLog: log.New(os.Stderr, log.Prefix(), log.Flags()),
	}

	router.Use(
		middleware.RequestID,
		middleware.Logger,
		middleware.Recoverer,
		middleware.URLFormat,
	)

	// Answers requests for top-level *.html paths with a plain 404 instead of
	// letting them fall through to the static file server.
	notFoundHTML := func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusNotFound)
		w.Write([]byte("404 page not found"))
	}

	router.Get("/", srv.handleRoot)
	router.Get("/{uuid:GPU-(.*)}", srv.handleGPU)
	router.Get("/{path:(.*).html}", notFoundHTML)

	// A Getwd failure leaves cwd empty, which makes the directory below the
	// relative path "static".
	cwd, _ := os.Getwd()
	fileServer(router, "/", http.Dir(filepath.Join(cwd, "static")))

	return srv
}
// Serve listens on every interface at the given TCP port and blocks while
// serving requests. The router is wrapped in an h2c handler so HTTP/2 can be
// spoken without TLS. The returned error is whatever ListenAndServe reports.
func (s *Server) Serve(port uint) error {
	cleartextH2 := h2c.NewHandler(s.mux, &http2.Server{})

	httpSrv := &http.Server{}
	httpSrv.Addr = fmt.Sprintf("0.0.0.0:%v", port)
	httpSrv.Handler = cleartextH2

	return httpSrv.ListenAndServe()
}
// handleRoot serves "/". When at least one GPU is present it redirects
// (307) to the page of the first GPU; otherwise it renders the
// static/no_gpu.html template. Repository and template-parsing failures
// produce the internal-server-error page.
func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) {
	gpus, err := s.repo.GetGPUs()
	if err != nil {
		sendInternalServerErrorHTML(w, err)
		return
	}

	if len(gpus) > 0 {
		w.Header().Set("Location", "/"+gpus[0].UUID)
		w.WriteHeader(http.StatusTemporaryRedirect)
		return
	}

	t, err := template.ParseFiles("static/no_gpu.html")
	if err != nil {
		sendInternalServerErrorHTML(w, err)
		// t is nil here; continuing would dereference it in Execute.
		return
	}

	wp := WebPack{
		Username: "anonymous",
	}
	// The response may already be partially written at this point, so an
	// execution failure can only be logged.
	if err := t.Execute(w, wp); err != nil {
		s.errLog.Println("[ERROR]", err)
	}
}
// handleGPU serves "/{uuid}". It looks up the GPU whose UUID matches the URL
// parameter, loads its details and renders static/index.html with them.
// An unknown UUID yields a plain 404; repository and template-parsing
// failures produce the internal-server-error page.
func (s *Server) handleGPU(w http.ResponseWriter, r *http.Request) {
	uuid := chi.URLParam(r, "uuid")

	gpus, err := s.repo.GetGPUs()
	if err != nil {
		sendInternalServerErrorHTML(w, err)
		return
	}

	// Translate the UUID from the URL into the index GetGPU expects.
	i := -1
	for _, gpu := range gpus {
		if gpu.UUID == uuid {
			i = gpu.Index
			break
		}
	}
	if i == -1 {
		w.WriteHeader(http.StatusNotFound)
		w.Write([]byte("404 page not found"))
		// Stop here: falling through would query GetGPU(-1) and write a
		// second response on top of the 404.
		return
	}

	gpu, err := s.repo.GetGPU(i)
	if err != nil {
		sendInternalServerErrorHTML(w, err)
		return
	}

	t, err := template.New("index").Funcs(templateFuncMap).ParseFiles("static/index.html")
	if err != nil {
		sendInternalServerErrorHTML(w, err)
		// t is nil here; continuing would dereference it in ExecuteTemplate.
		return
	}

	wp := WebPack{
		Username:      "anonymous",
		GPUs:          gpus,
		GPU:           gpu,
		DriverVersion: s.repo.DriverVersion(),
		CUDAVersion:   s.repo.CUDAVersion(),
	}
	// The response may already be partially written at this point, so an
	// execution failure can only be logged.
	if err := t.ExecuteTemplate(w, "index.html", wp); err != nil {
		s.errLog.Println("[ERROR]", err)
	}
}
// fileServer mounts an http.FileServer for root on router r under path.
// path must not contain URL parameters ('{', '}' or '*'); the function panics
// if it does. When path is not "/" and lacks a trailing slash, a permanent
// redirect to the slash-terminated form is registered as well.
func fileServer(r chi.Router, path string, root http.FileSystem) {
	if strings.ContainsAny(path, "{}*") {
		panic("FileServer does not permit any URL parameters.")
	}

	missingSlash := path != "/" && path[len(path)-1] != '/'
	if missingSlash {
		redirect := http.RedirectHandler(path+"/", http.StatusMovedPermanently)
		r.Get(path, redirect.ServeHTTP)
		path += "/"
	}

	r.Get(path+"*", func(w http.ResponseWriter, req *http.Request) {
		// Strip the mount point (route pattern without its "/*" wildcard)
		// so the file server sees paths relative to root.
		mount := strings.TrimSuffix(chi.RouteContext(req.Context()).RoutePattern(), "/*")
		http.StripPrefix(mount, http.FileServer(root)).ServeHTTP(w, req)
	})
}
// sendInternalServerErrorHTML replies with status 500 and the
// internalServerErrorPage document, with v formatted into its %v placeholder.
func sendInternalServerErrorHTML(w http.ResponseWriter, v any) {
	w.WriteHeader(http.StatusInternalServerError)
	fmt.Fprintf(w, internalServerErrorPage, v)
}
// convertByteSize renders v as a bytesize.ByteSize string. It is exposed to
// templates as "ConvertByteSize".
func convertByteSize(v int) string {
	return bytesize.New(float64(v)).String()
}
// percentageRounded returns a as a percentage of b, rounded to the nearest
// integer (halves round up). It returns 0 when b is 0 instead of panicking.
// Rounding is only exact for non-negative a and b.
func percentageRounded(a, b int) int {
	if b == 0 {
		return 0
	}
	// Multiply before dividing: the integer quotient a/b would truncate to 0
	// whenever a < b. Adding b/2 rounds to nearest; int64 keeps a*100 from
	// overflowing on platforms where int is 32 bits wide.
	num, den := int64(a), int64(b)
	return int((num*100 + den/2) / den)
}

14
go.mod Normal file
View File

@@ -0,0 +1,14 @@
module nvidiadashboard
go 1.22
require (
github.com/NVIDIA/go-nvml v0.12.0-2
github.com/go-chi/chi/v5 v5.0.12
golang.org/x/net v0.21.0
)
require (
github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf // indirect
golang.org/x/text v0.14.0 // indirect
)

26
go.sum Normal file
View File

@@ -0,0 +1,26 @@
github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY=
github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-chi/chi/v5 v5.0.12 h1:9euLV5sTrTNTRUU9POmDUvfxyj6LAABLUcEWO+JJb4s=
github.com/go-chi/chi/v5 v5.0.12/go.mod h1:DslCQbL2OYiznFReuXYUmQ2hGd1aDpCnlMNITLSKoi8=
github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf h1:FtEj8sfIcaaBfAKrE1Cwb61YDtYq9JxChK1c7AKce7s=
github.com/inhies/go-bytesize v0.0.0-20220417184213-4913239db9cf/go.mod h1:yrqSXGoD/4EKfF26AOGzscPOgTTJcyAwM2rpixWT+t4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

25
main.go Normal file
View File

@@ -0,0 +1,25 @@
package main
import (
"log"
"nvidiadashboard/api"
"nvidiadashboard/pkg/nvidia"
)
// main initializes the NVIDIA repository, logs what it found and serves the
// dashboard on port 3000 until the HTTP server fails.
func main() {
	r := nvidia.New()
	// Covers exits through panics; the log.Fatal path below closes explicitly.
	defer r.Close()

	log.Println("[INFO] NVIDIA driver loaded:", r.DriverVersion())

	gpus, err := r.GetGPUs()
	if err != nil {
		// Not fatal: the handlers query the repository again on each request.
		log.Println("[ERROR] unable to list GPUs:", err)
	}
	log.Printf("[INFO] %d NVIDIA GPUs found", len(gpus))

	s := api.New(r)
	log.Printf("[INFO] Server listening at :3000")
	if err := s.Serve(3000); err != nil {
		// log.Fatal exits the process without running deferred calls, so
		// release the repository first.
		if cerr := r.Close(); cerr != nil {
			log.Println("[ERROR]", cerr)
		}
		log.Fatal(err)
	}
}

268
pkg/nvidia/nvidia.go Normal file
View File

@@ -0,0 +1,268 @@
package nvidia
import (
"errors"
"fmt"
"os"
"strconv"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
// Repository gives access to GPU information and caches the driver and CUDA
// version strings read when it was created.
type Repository struct {
	driverVersion string
	cudaVersion   string
}

// GPU identifies one device.
type GPU struct {
	Name  string
	UUID  string
	Index int
}

// GPUDetail is a GPU together with a snapshot of its current state.
type GPUDetail struct {
	GPU
	CoreTemperature int
	Utilization     Utilization
	Processes       []Process
	Memory          Memory
	Fans            []Fan
}

// Memory mirrors the fields of the device memory report.
type Memory struct {
	Free     int
	Reserved int
	Total    int
	Used     int
	Version  int
}

// Process describes one process running on a GPU. Type is one of
// "Compute", "Graphics" or "MPSCompute".
type Process struct {
	PID        int
	MemoryUsed int
	Name       string
	Type       string
}

// Utilization groups the decoder, encoder and overall GPU utilization values.
type Utilization struct {
	Decode int
	Encode int
	Rate   int
}

// Fan holds the reported speed of a single fan.
type Fan struct {
	Speed int
}
// instance is the package-level Repository created by the first call to New
// and returned by every later call.
var instance *Repository
// Close shuts NVML down. It returns the NVML error text as an error when the
// shutdown call does not report success.
func (*Repository) Close() error {
	if status := nvml.Shutdown(); status != nvml.SUCCESS {
		return errors.New(nvml.ErrorString(status))
	}
	return nil
}
// New returns the package-wide Repository, creating it on first use. Creation
// initializes NVML and reads the driver and CUDA driver versions once.
//
// New panics when NVML cannot be initialized or when either version cannot be
// read; the panic message names the step that failed.
func New() *Repository {
	if instance != nil {
		return instance
	}

	if ret := nvml.Init(); ret != nvml.SUCCESS {
		panic("unable to initialize NVML: " + nvml.ErrorString(ret))
	}

	driverVersion, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		panic("unable to get driver version: " + nvml.ErrorString(ret))
	}

	cudaVersion, ret := nvml.SystemGetCudaDriverVersion_v2()
	if ret != nvml.SUCCESS {
		panic("unable to get CUDA driver version: " + nvml.ErrorString(ret))
	}

	instance = &Repository{
		driverVersion: driverVersion,
		cudaVersion:   parseCUDA(cudaVersion),
	}
	return instance
}
// GetGPUs returns the name, UUID and index of every device NVML reports, in
// index order. The first device that cannot be queried aborts the listing and
// its error is returned.
func (*Repository) GetGPUs() ([]GPU, error) {
	total, status := nvml.DeviceGetCount()
	if status != nvml.SUCCESS {
		return nil, errors.New("unable to get device count: " + nvml.ErrorString(status))
	}

	list := make([]GPU, total)
	for idx := range list {
		entry, _, err := getGPU(idx)
		if err != nil {
			return nil, err
		}
		list[idx] = entry
	}
	return list, nil
}
// GetGPU returns a snapshot of the device at index ID: fan speeds, core
// temperature, utilization rates, memory usage and the compute, graphics and
// MPS processes running on it.
//
// Any NVML call that does not report success aborts the snapshot; the
// returned error carries the NVML error text and the GPUDetail is empty.
func (*Repository) GetGPU(ID int) (GPUDetail, error) {
	gpu, device, err := getGPU(ID)
	if err != nil {
		return GPUDetail{}, err
	}

	fanCount, ret := device.GetNumFans()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	fans := make([]Fan, 0, fanCount)
	for i := 0; i < fanCount; i++ {
		fdev, ret := device.GetFanSpeed_v2(i)
		if ret != nvml.SUCCESS {
			return GPUDetail{}, errors.New(nvml.ErrorString(ret))
		}
		fans = append(fans, Fan{Speed: int(fdev)})
	}

	temp, ret := device.GetTemperature(nvml.TEMPERATURE_GPU)
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	load, ret := device.GetUtilizationRates()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	decUsage, _, ret := device.GetDecoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	encUsage, _, ret := device.GetEncoderUtilization()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	// Fetch all running process on the GPU
	var allProcess []Process

	// Compute proc
	proc, ret := device.GetComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}
	for _, p := range proc {
		allProcess = append(allProcess, Process{
			PID:        int(p.Pid),
			MemoryUsed: int(p.UsedGpuMemory),
			Type:       "Compute",
			Name:       "n/a",
		})
	}

	// Graphics/3D procs
	proc, ret = device.GetGraphicsRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}
	for _, p := range proc {
		allProcess = append(allProcess, Process{
			PID:        int(p.Pid),
			MemoryUsed: int(p.UsedGpuMemory),
			Type:       "Graphics",
			Name:       "n/a",
		})
	}

	// MPS procs
	proc, ret = device.GetMPSComputeRunningProcesses()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}
	for _, p := range proc {
		allProcess = append(allProcess, Process{
			PID:        int(p.Pid),
			MemoryUsed: int(p.UsedGpuMemory),
			Type:       "MPSCompute",
			Name:       "n/a",
		})
	}

	// Find process command line. Iterate by index: ranging by value would
	// assign Name on a copy and leave every entry at "n/a".
	// NOTE(review): /proc/<pid>/cmdline presumably separates arguments with
	// NUL bytes, which end up in Name unchanged — confirm how the template
	// should display them.
	for i := range allProcess {
		cmdline, err := os.ReadFile("/proc/" + strconv.Itoa(allProcess[i].PID) + "/cmdline")
		if err != nil || len(cmdline) == 0 {
			// Best effort: keep the "n/a" placeholder when the command line
			// is unreadable or empty.
			continue
		}
		allProcess[i].Name = string(cmdline)
	}

	mem, ret := device.GetMemoryInfo_v2()
	if ret != nvml.SUCCESS {
		return GPUDetail{}, errors.New(nvml.ErrorString(ret))
	}

	return GPUDetail{
		GPU:             gpu,
		CoreTemperature: int(temp),
		Utilization: Utilization{
			Decode: int(decUsage),
			Encode: int(encUsage),
			Rate:   int(load.Gpu),
		},
		Processes: allProcess,
		Memory: Memory{
			Free:     int(mem.Free),
			Reserved: int(mem.Reserved),
			Total:    int(mem.Total),
			Used:     int(mem.Used),
			Version:  int(mem.Version),
		},
		Fans: fans,
	}, nil
}
// DriverVersion returns the driver version string cached when the Repository
// was created.
func (repo *Repository) DriverVersion() string {
	return repo.driverVersion
}
// CUDAVersion returns the "major.minor" CUDA version string cached when the
// Repository was created.
func (repo *Repository) CUDAVersion() string {
	return repo.cudaVersion
}
// parseCUDA formats a numeric CUDA version as "major.minor", where major is
// version/1000 and minor is the remainder divided by 10 (12020 -> "12.2").
func parseCUDA(version int) string {
	major, rest := version/1000, version%1000
	return fmt.Sprintf("%d.%d", major, rest/10)
}
// getGPU resolves the device at index ID and returns its name/UUID summary
// along with the NVML device handle. When any NVML call does not report
// success, it returns zero values and an error carrying the NVML error text.
func getGPU(ID int) (GPU, nvml.Device, error) {
	// fail builds the common error return for a non-success NVML status.
	fail := func(status nvml.Return) (GPU, nvml.Device, error) {
		return GPU{}, nvml.Device{}, errors.New(nvml.ErrorString(status))
	}

	handle, status := nvml.DeviceGetHandleByIndex(ID)
	if status != nvml.SUCCESS {
		return fail(status)
	}

	label, status := handle.GetName()
	if status != nvml.SUCCESS {
		return fail(status)
	}

	id, status := handle.GetUUID()
	if status != nvml.SUCCESS {
		return fail(status)
	}

	return GPU{Name: label, UUID: id, Index: ID}, handle, nil
}

1
static Submodule

Submodule static added at de2d3dc85f