编辑
2024-05-30
开发
00
请注意,本文编写于 100 天前,最后修改于 100 天前,其中某些信息可能已经过时。

目录

背景
后端
前端

背景

总是被老板说组里的那么多服务器都没有利用起来,在我自己实际观察中发现确实存在着比较严重的服务器挤兑现象,某台服务器很多人用需要排队,而其他的服务器却基本上没有人用,借着组里很多师兄师姐都要毕业,准备做一套简单的系统来监控这些服务器的使用情况。

后端

对于GPU服务器,我们一般需要查看其GPU使用情况,可以通过

sh
nvidia-smi --query-gpu=name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits

这条命令将gpu的型号、温度、功率、使用率、显存等情况输出为一组csv的格式

对于显卡的占用进程,可以通过

sh
nvidia-smi pmon -s m -c 1

这条命令得到每张卡的占用进程PID、进程名、占用显存等

对于服务器内存,可以通过

sh
free -h

得到内存的总量、已用、空闲、缓存等使用情况

对于cpu占用,可以通过

sh
mpstat 1 1

得到CPU的用户态、内核态占用率以及空闲率等情况

对于cpu进程,可以通过ps命令得到

sh
sh -c "ps aux --sort=-%cpu --no-header | head -n 50"

对于IP地址,则可以通过golang内部的库函数得到。

综合以上信息,我们写一个简单的golang代码如下

go
// Command server_monitor exposes a small HTTP API that reports GPU, memory,
// CPU, process and IP information about the host it runs on. The data is
// collected by shelling out to nvidia-smi, free, mpstat, ps and uptime.
package main

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net"
	"net/http"
	"os/exec"
	"strings"
	"time"
)

const (
	// listenAddr is the address the HTTP server binds to.
	listenAddr = ":12802"

	// commandTimeout bounds every external command so that a hung tool
	// (for example nvidia-smi on a faulty GPU) cannot block a handler forever.
	commandTimeout = 10 * time.Second
)

var (
	errFreeOutput   = errors.New("unexpected output from free command")
	errMpstatOutput = errors.New("unexpected output from mpstat command")
)

// GPUInfo represents detailed information of a GPU.
type GPUInfo struct {
	Model          string `json:"model"`
	Temperature    string `json:"temperature"`
	PowerDraw      string `json:"power_draw"`
	UtilizationGPU string `json:"utilization_gpu"`
	MemoryUsed     string `json:"memory_used"`
	MemoryTotal    string `json:"memory_total"`
}

// GPUStatus represents the status of all GPUs.
type GPUStatus struct {
	GPUs []GPUInfo `json:"gpus"`
}

// GPUProcess represents a process using the GPU.
type GPUProcess struct {
	PID     string `json:"pid"`
	Type    string `json:"type"`
	Name    string `json:"name"`
	UsedMem string `json:"used_memory"`
}

// MemoryInfo represents memory usage information as printed by "free -h".
type MemoryInfo struct {
	Total     string `json:"total"`
	Used      string `json:"used"`
	Free      string `json:"free"`
	Shared    string `json:"shared"`
	BuffCache string `json:"buff_cache"`
	Available string `json:"available"`
}

// CPUInfo represents CPU usage information.
type CPUInfo struct {
	User   string `json:"user"`
	System string `json:"system"`
	Idle   string `json:"idle"`
}

// ProcessInfo represents information about a process (one row of "ps aux").
type ProcessInfo struct {
	PID   string `json:"pid"`
	User  string `json:"user"`
	CPU   string `json:"cpu"`
	MEM   string `json:"mem"`
	VSZ   string `json:"vsz"`
	RSS   string `json:"rss"`
	TTY   string `json:"tty"`
	Stat  string `json:"stat"`
	Start string `json:"start"`
	Time  string `json:"time"`
	Cmd   string `json:"cmd"`
}

// ServerStatus represents the server status.
type ServerStatus struct {
	Uptime    string        `json:"uptime"`
	Memory    MemoryInfo    `json:"memory"`
	CPU       CPUInfo       `json:"cpu"`
	Processes []ProcessInfo `json:"processes"`
}

// IPAddress represents the server's IP address.
type IPAddress struct {
	IP string `json:"ip"`
}

// runCommand executes name with args and returns its standard output.
// The command is killed if it runs longer than commandTimeout.
func runCommand(name string, args ...string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), commandTimeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, name, args...).Output()
	if err != nil {
		return "", fmt.Errorf("running %s: %w", name, err)
	}
	return string(out), nil
}

// splitLines trims surrounding whitespace from output and splits it on
// newlines. Empty output yields a single empty line.
func splitLines(output string) []string {
	return strings.Split(strings.TrimSpace(output), "\n")
}

// columnIndex returns the position of the first header column equal to any
// of names, or -1 when none is present.
func columnIndex(header []string, names ...string) int {
	for i, col := range header {
		for _, name := range names {
			if col == name {
				return i
			}
		}
	}
	return -1
}

// parseGPUStatus parses the CSV produced by
// "nvidia-smi --query-gpu=... --format=csv,noheader,nounits".
// Lines that do not have exactly six fields are skipped. The result is never
// nil so that it encodes as a JSON array even when no GPU is reported.
func parseGPUStatus(output string) []GPUInfo {
	lines := splitLines(output)
	gpus := make([]GPUInfo, 0, len(lines))
	for _, line := range lines {
		fields := strings.Split(strings.TrimSpace(line), ", ")
		if len(fields) != 6 {
			continue
		}
		gpus = append(gpus, GPUInfo{
			Model:          fields[0],
			Temperature:    fields[1] + " C",
			PowerDraw:      fields[2] + " W",
			UtilizationGPU: fields[3] + " %",
			MemoryUsed:     fields[4] + " MiB",
			MemoryTotal:    fields[5] + " MiB",
		})
	}
	return gpus
}

// parseGPUProcesses parses the table printed by "nvidia-smi pmon -s m -c 1".
//
// Header lines (those starting with '#') are skipped wherever they occur, so
// short or empty output no longer causes an out-of-range slice. The position
// of the process name is taken from the "command" header column because the
// number of columns before it differs between layouts; when no such header
// is seen the original position (index 5) is used. The result is never nil.
func parseGPUProcesses(output string) []GPUProcess {
	const (
		pidColumn        = 1
		typeColumn       = 2
		memColumn        = 3
		legacyNameColumn = 5
	)

	lines := splitLines(output)
	processes := make([]GPUProcess, 0, len(lines))
	nameColumn := legacyNameColumn
	for _, line := range lines {
		trimmed := strings.TrimSpace(line)
		if strings.HasPrefix(trimmed, "#") {
			header := strings.Fields(strings.TrimPrefix(trimmed, "#"))
			// Only accept a name column that lies after the fixed columns.
			if idx := columnIndex(header, "command"); idx > memColumn {
				nameColumn = idx
			}
			continue
		}

		fields := strings.Fields(trimmed)
		if len(fields) <= nameColumn {
			continue
		}
		processes = append(processes, GPUProcess{
			PID:  fields[pidColumn],
			Type: fields[typeColumn],
			// The name is the last column; keep any embedded spaces.
			Name:    strings.Join(fields[nameColumn:], " "),
			UsedMem: fields[memColumn] + " MiB",
		})
	}
	return processes
}

// parseMemoryInfo parses the second line (the "Mem:" row) of "free -h".
func parseMemoryInfo(output string) (*MemoryInfo, error) {
	lines := splitLines(output)
	if len(lines) < 2 {
		return nil, errFreeOutput
	}
	fields := strings.Fields(lines[1])
	if len(fields) < 7 {
		return nil, errFreeOutput
	}
	return &MemoryInfo{
		Total:     fields[1],
		Used:      fields[2],
		Free:      fields[3],
		Shared:    fields[4],
		BuffCache: fields[5],
		Available: fields[6],
	}, nil
}

// parseCPUInfo parses the output of "mpstat 1 1".
//
// The third line of the output is the column header and the fourth is the
// sample. Column positions shift depending on whether the timestamp carries
// an AM/PM token and on which columns the installed mpstat prints, so the
// user/system/idle columns are located by header name. If the header does
// not contain the expected names the original fixed positions (3, 5, 11)
// are used.
func parseCPUInfo(output string) (*CPUInfo, error) {
	lines := splitLines(output)
	if len(lines) < 4 {
		return nil, errMpstatOutput
	}
	header := strings.Fields(lines[2])
	values := strings.Fields(lines[3])

	user, system, idle := 3, 5, 11
	u := columnIndex(header, "%usr", "%user")
	s := columnIndex(header, "%sys")
	i := columnIndex(header, "%idle")
	if u >= 0 && s >= 0 && i >= 0 {
		user, system, idle = u, s, i
	}
	if user >= len(values) || system >= len(values) || idle >= len(values) {
		return nil, errMpstatOutput
	}

	return &CPUInfo{
		User:   values[user] + " %",
		System: values[system] + " %",
		Idle:   values[idle] + " %",
	}, nil
}

// parseProcesses parses header-less "ps aux" output. Rows with fewer than
// eleven fields are skipped; everything from the eleventh field on is the
// command line. The result is never nil.
func parseProcesses(output string) []ProcessInfo {
	lines := splitLines(output)
	processes := make([]ProcessInfo, 0, len(lines))
	for _, line := range lines {
		fields := strings.Fields(line)
		if len(fields) < 11 {
			continue
		}
		processes = append(processes, ProcessInfo{
			PID:   fields[1],
			User:  fields[0],
			CPU:   fields[2],
			MEM:   fields[3],
			VSZ:   fields[4],
			RSS:   fields[5],
			TTY:   fields[6],
			Stat:  fields[7],
			Start: fields[8],
			Time:  fields[9],
			Cmd:   strings.Join(fields[10:], " "),
		})
	}
	return processes
}

// getGPUStatus queries nvidia-smi for per-GPU model, temperature, power,
// utilization and memory figures.
func getGPUStatus() (*GPUStatus, error) {
	output, err := runCommand("nvidia-smi",
		"--query-gpu=name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
		"--format=csv,noheader,nounits")
	if err != nil {
		return nil, err
	}
	return &GPUStatus{GPUs: parseGPUStatus(output)}, nil
}

// getGPUProcesses queries nvidia-smi for the processes occupying each GPU.
func getGPUProcesses() ([]GPUProcess, error) {
	output, err := runCommand("nvidia-smi", "pmon", "-s", "m", "-c", "1")
	if err != nil {
		return nil, err
	}
	return parseGPUProcesses(output), nil
}

// getMemoryInfo reports system memory usage via "free -h".
func getMemoryInfo() (*MemoryInfo, error) {
	output, err := runCommand("free", "-h")
	if err != nil {
		return nil, err
	}
	return parseMemoryInfo(output)
}

// getCPUInfo reports CPU usage via "mpstat 1 1".
func getCPUInfo() (*CPUInfo, error) {
	output, err := runCommand("mpstat", "1", "1")
	if err != nil {
		return nil, err
	}
	return parseCPUInfo(output)
}

// getServerStatus gathers uptime, memory, CPU and the first 50 rows of
// "ps aux" sorted by descending CPU usage.
func getServerStatus() (*ServerStatus, error) {
	uptimeOutput, err := runCommand("uptime", "-p")
	if err != nil {
		return nil, err
	}

	memoryInfo, err := getMemoryInfo()
	if err != nil {
		return nil, err
	}

	cpuInfo, err := getCPUInfo()
	if err != nil {
		return nil, err
	}

	psOutput, err := runCommand("sh", "-c", "ps aux --sort=-%cpu --no-header | head -n 50")
	if err != nil {
		return nil, err
	}

	return &ServerStatus{
		Uptime:    strings.TrimSpace(uptimeOutput),
		Memory:    *memoryInfo,
		CPU:       *cpuInfo,
		Processes: parseProcesses(psOutput),
	}, nil
}

// getIPAddress returns the first non-loopback IPv4 address found among the
// host's interface addresses.
func getIPAddress() (*IPAddress, error) {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		return nil, err
	}
	for _, addr := range addrs {
		ipNet, ok := addr.(*net.IPNet)
		if !ok || ipNet.IP.IsLoopback() || ipNet.IP.To4() == nil {
			continue
		}
		return &IPAddress{IP: ipNet.IP.String()}, nil
	}
	return nil, fmt.Errorf("IP address not found")
}

// writeJSON encodes v as the JSON response body. Encoding errors can only
// be logged because the status line has already been sent by then.
func writeJSON(w http.ResponseWriter, v interface{}) {
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(v); err != nil {
		log.Printf("encoding response: %v", err)
	}
}

// writeError reports err to the client as a 500 response.
func writeError(w http.ResponseWriter, err error) {
	http.Error(w, err.Error(), http.StatusInternalServerError)
}

// gpuHandler serves GET /gpu.
func gpuHandler(w http.ResponseWriter, r *http.Request) {
	status, err := getGPUStatus()
	if err != nil {
		writeError(w, err)
		return
	}
	writeJSON(w, status)
}

// gpuProcessesHandler serves GET /gpu/processes.
func gpuProcessesHandler(w http.ResponseWriter, r *http.Request) {
	processes, err := getGPUProcesses()
	if err != nil {
		writeError(w, err)
		return
	}
	writeJSON(w, processes)
}

// serverHandler serves GET /server.
func serverHandler(w http.ResponseWriter, r *http.Request) {
	status, err := getServerStatus()
	if err != nil {
		writeError(w, err)
		return
	}
	writeJSON(w, status)
}

// ipHandler serves GET /ip.
func ipHandler(w http.ResponseWriter, r *http.Request) {
	ip, err := getIPAddress()
	if err != nil {
		writeError(w, err)
		return
	}
	writeJSON(w, ip)
}

// heartbeatHandler serves GET /heartbeat with a plain "OK" body.
func heartbeatHandler(w http.ResponseWriter, r *http.Request) {
	if _, err := w.Write([]byte("OK")); err != nil {
		log.Printf("writing heartbeat: %v", err)
	}
}

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/gpu", gpuHandler)
	mux.HandleFunc("/gpu/processes", gpuProcessesHandler)
	mux.HandleFunc("/server", serverHandler)
	mux.HandleFunc("/ip", ipHandler)
	mux.HandleFunc("/heartbeat", heartbeatHandler)

	srv := &http.Server{
		Addr:              listenAddr,
		Handler:           mux,
		ReadHeaderTimeout: 5 * time.Second,
		ReadTimeout:       10 * time.Second,
		// /server runs four commands in sequence, each bounded by
		// commandTimeout, so the write deadline must exceed their sum.
		WriteTimeout: 45 * time.Second,
		IdleTimeout:  60 * time.Second,
	}

	fmt.Println("Server started at :12802")
	// ListenAndServe only returns on failure (e.g. the port is in use).
	log.Fatal(srv.ListenAndServe())
}

这份代码实现了五个api:/gpu(GPU的型号、温度、功率、使用率与显存)、/gpu/processes(占用GPU的进程)、/server(运行时间、内存、CPU占用以及CPU占用最高的50个进程)、/ip(服务器的IP地址)、以及 /heartbeat(心跳检测)。

通过设置GOOS=linux;GOARCH=amd64在我的MacOS上交叉编译到linux,可以直接上传一个二进制包就完成部署,然后再通过linux service完成重启与自启动设置

toml
[Unit]
Description=server monitor service
After=network.target syslog.target
Wants=network.target

[Service]
Type=simple
ExecStart=/home/xuqi/server_monitor/go_build_server_monitor_linux
Restart=always
RestartSec=10
User=xuqi

[Install]
WantedBy=multi-user.target

最后再通过frp服务转发到公网(因为这些服务器都在内网,不做穿透无法访问到api)

就完成了服务器监测的后端代码了。

前端

前端则拟使用vue3框架 + element plus的组件库搭建,当然,还要使用tailwind css美化一下

简单起见,也做成一个单页面就好。

本文作者:insomnia

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!