总是被老板说组里的那么多服务器都没有利用起来,在我自己实际观察中发现确实存在着比较严重的服务器挤兑现象,某台服务器很多人用需要排队,而其他的服务器却基本上没有人用,借着组里很多师兄师姐都要毕业,准备做一套简单的系统来监控这些服务器的使用情况。
对于GPU服务器,我们一般需要查看其GPU使用情况,可以通过
```sh
nvidia-smi --query-gpu=name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits
```
这条命令将gpu的型号、温度、功率、使用率、显存等情况输出为一组csv的格式
对于显卡的占用进程,可以通过
```sh
nvidia-smi pmon -s m -c 1
```
这条命令得到每张卡的占用进程PID、进程名、占用显存等
对于服务器内存,可以通过
```sh
free -h
```
得到情况
对于cpu占用,可以通过
```sh
mpstat 1 1
```
得到情况
对于cpu进程,可以通过ps命令得到
```sh
sh -c "ps aux --sort=-%cpu --no-header | head -n 50"
```
对于IP地址,则可以通过golang内部的库函数得到。
综合以上信息,我们写一个简单的golang代码如下
package main
import (
"encoding/json"
"fmt"
"net"
"net/http"
"os/exec"
"strings"
)
// GPUInfo describes a single GPU as reported by the nvidia-smi CSV query.
// All readings are stored as text; getGPUStatus appends a unit suffix to
// every field except Model.
type GPUInfo struct {
	Model          string `json:"model"`           // product name of the card
	Temperature    string `json:"temperature"`     // temperature reading, suffixed with " C"
	PowerDraw      string `json:"power_draw"`      // power draw reading, suffixed with " W"
	UtilizationGPU string `json:"utilization_gpu"` // utilization reading, suffixed with " %"
	MemoryUsed     string `json:"memory_used"`     // used memory, suffixed with " MiB"
	MemoryTotal    string `json:"memory_total"`    // total memory, suffixed with " MiB"
}
// GPUStatus is the JSON envelope for the GPU listing: it wraps one GPUInfo
// entry per parsed GPU under the "gpus" key.
type GPUStatus struct {
	// GPUs stays nil (encoded as JSON null) when no GPU row was parsed.
	GPUs []GPUInfo `json:"gpus"`
}
// GPUProcess is one row of `nvidia-smi pmon` output, i.e. a process that
// currently occupies a GPU. All columns are kept as text.
type GPUProcess struct {
	PID     string `json:"pid"`         // process ID column
	Type    string `json:"type"`        // process type column as printed by pmon
	Name    string `json:"name"`        // command name column
	UsedMem string `json:"used_memory"` // memory column, suffixed with " MiB" by getGPUProcesses
}
// MemoryInfo carries the memory figures taken from the second line of
// `free -h`, copied verbatim as the human-readable strings free prints.
type MemoryInfo struct {
	Total     string `json:"total"`      // 2nd whitespace-separated field of the line
	Used      string `json:"used"`       // 3rd field
	Free      string `json:"free"`       // 4th field
	Shared    string `json:"shared"`     // 5th field
	BuffCache string `json:"buff_cache"` // 6th field
	Available string `json:"available"`  // 7th field
}
// CPUInfo summarizes CPU load as sampled by mpstat. Each value is the text
// printed by mpstat with " %" appended by getCPUInfo.
type CPUInfo struct {
	User   string `json:"user"`   // user-mode share
	System string `json:"system"` // system-mode share
	Idle   string `json:"idle"`   // idle share
}
// ProcessInfo is one row of `ps aux` output. Every column is stored as the
// raw text printed by ps; getServerStatus fills the fields positionally.
type ProcessInfo struct {
	PID   string `json:"pid"`   // 2nd column of the ps row
	User  string `json:"user"`  // 1st column
	CPU   string `json:"cpu"`   // 3rd column
	MEM   string `json:"mem"`   // 4th column
	VSZ   string `json:"vsz"`   // 5th column
	RSS   string `json:"rss"`   // 6th column
	TTY   string `json:"tty"`   // 7th column
	Stat  string `json:"stat"`  // 8th column
	Start string `json:"start"` // 9th column
	Time  string `json:"time"`  // 10th column
	Cmd   string `json:"cmd"`   // remaining columns re-joined with single spaces
}
// ServerStatus is the payload of the /server endpoint: host uptime plus
// memory, CPU and process snapshots gathered by getServerStatus.
type ServerStatus struct {
	Uptime    string        `json:"uptime"`    // trimmed output of `uptime -p`
	Memory    MemoryInfo    `json:"memory"`    // result of getMemoryInfo
	CPU       CPUInfo       `json:"cpu"`       // result of getCPUInfo
	Processes []ProcessInfo `json:"processes"` // parsed ps rows; nil when none parsed
}
// IPAddress is the payload of the /ip endpoint.
type IPAddress struct {
	// IP is the textual form of the address selected by getIPAddress.
	IP string `json:"ip"`
}
// getGPUStatus queries nvidia-smi for name, temperature, power draw,
// utilization and memory figures in header-less, unit-less CSV form and
// converts each well-formed row into a GPUInfo.
//
// Rows that do not split into exactly six ", "-separated columns are
// silently dropped. An error is returned only when the command itself fails.
func getGPUStatus() (*GPUStatus, error) {
	raw, err := exec.Command(
		"nvidia-smi",
		"--query-gpu=name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return nil, err
	}

	status := &GPUStatus{}
	for _, row := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
		cols := strings.Split(row, ", ")
		if len(cols) == 6 {
			// nounits strips the units, so re-attach them for display.
			status.GPUs = append(status.GPUs, GPUInfo{
				Model:          cols[0],
				Temperature:    cols[1] + " C",
				PowerDraw:      cols[2] + " W",
				UtilizationGPU: cols[3] + " %",
				MemoryUsed:     cols[4] + " MiB",
				MemoryTotal:    cols[5] + " MiB",
			})
		}
	}
	return status, nil
}
// getGPUProcesses runs `nvidia-smi pmon -s m -c 1` (a single sample of the
// per-process memory monitor) and returns the processes it lists.
//
// An error is returned only when the command fails; output that contains no
// process rows yields an empty result rather than an error.
func getGPUProcesses() ([]GPUProcess, error) {
	output, err := exec.Command("nvidia-smi", "pmon", "-s", "m", "-c", "1").Output()
	if err != nil {
		return nil, err
	}
	return parseGPUProcesses(string(output)), nil
}

// parseGPUProcesses extracts process rows from pmon output.
//
// Header lines are recognised by their leading '#' instead of by position, so
// short or empty output can no longer cause an out-of-range slice panic.
// NOTE(review): the column positions below (PID at 1, type at 2, memory at 3,
// command name at 5) assume the six-column layout the original code was
// written against — confirm against the installed driver's pmon output.
func parseGPUProcesses(output string) []GPUProcess {
	var processes []GPUProcess
	for _, line := range strings.Split(output, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 6 {
			continue
		}
		// pmon prints "-" in every column for a GPU with no running
		// process; such a placeholder row is not a process.
		if fields[1] == "-" {
			continue
		}
		processes = append(processes, GPUProcess{
			PID:     fields[1],
			Type:    fields[2],
			Name:    fields[5],
			UsedMem: fields[3] + " MiB",
		})
	}
	return processes
}
// getMemoryInfo runs `free -h` and maps fields 1-6 of its second output line
// onto a MemoryInfo.
//
// It fails when the command fails, when fewer than two lines are printed, or
// when the second line has fewer than seven whitespace-separated fields.
func getMemoryInfo() (*MemoryInfo, error) {
	raw, err := exec.Command("free", "-h").Output()
	if err != nil {
		return nil, err
	}

	rows := strings.Split(strings.TrimSpace(string(raw)), "\n")
	if len(rows) < 2 {
		return nil, fmt.Errorf("unexpected output from free command")
	}

	// cols[0] is the row label; the figures start at index 1.
	cols := strings.Fields(rows[1])
	if len(cols) < 7 {
		return nil, fmt.Errorf("unexpected output from free command")
	}

	info := new(MemoryInfo)
	info.Total = cols[1]
	info.Used = cols[2]
	info.Free = cols[3]
	info.Shared = cols[4]
	info.BuffCache = cols[5]
	info.Available = cols[6]
	return info, nil
}
// getCPUInfo samples CPU usage with `mpstat 1 1` (one report over a
// one-second interval, so the call blocks for about a second) and returns the
// user, system and idle percentages.
//
// It fails when the command fails or when its output cannot be interpreted.
func getCPUInfo() (*CPUInfo, error) {
	output, err := exec.Command("mpstat", "1", "1").Output()
	if err != nil {
		return nil, err
	}
	return parseMpstat(string(output))
}

// parseMpstat locates the %usr/%sys/%idle columns by name in the mpstat
// header row and reads the same positions from the row that follows it.
//
// Fixed column indices are not reliable: the timestamp occupies one or two
// fields depending on whether the locale prints AM/PM, and the set of
// percentage columns differs between sysstat versions, which made the
// previous hard-coded indices report the wrong columns.
func parseMpstat(output string) (*CPUInfo, error) {
	lines := strings.Split(strings.TrimSpace(output), "\n")
	for i := 0; i+1 < len(lines); i++ {
		header := strings.Fields(lines[i])
		usr, sys, idle := -1, -1, -1
		for col, name := range header {
			switch name {
			case "%usr", "%user": // older sysstat releases label the column %user
				usr = col
			case "%sys":
				sys = col
			case "%idle":
				idle = col
			}
		}
		if usr < 0 || sys < 0 || idle < 0 {
			continue // not the header row
		}

		values := strings.Fields(lines[i+1])
		// The header and data rows share the same timestamp prefix, so
		// they must have equally many fields for the indices to line up.
		if len(values) != len(header) {
			break
		}
		return &CPUInfo{
			User:   values[usr] + " %",
			System: values[sys] + " %",
			Idle:   values[idle] + " %",
		}, nil
	}
	return nil, fmt.Errorf("unexpected output from mpstat command")
}
// getServerStatus assembles the host snapshot returned by the /server
// endpoint. In order, it collects:
//
//  1. the trimmed output of `uptime -p`,
//  2. memory figures from getMemoryInfo,
//  3. CPU figures from getCPUInfo,
//  4. up to 50 rows of `ps aux` sorted by descending CPU usage.
//
// The first failing step aborts the call and its error is returned as is.
func getServerStatus() (*ServerStatus, error) {
	uptimeRaw, err := exec.Command("uptime", "-p").Output()
	if err != nil {
		return nil, err
	}

	mem, err := getMemoryInfo()
	if err != nil {
		return nil, err
	}

	cpu, err := getCPUInfo()
	if err != nil {
		return nil, err
	}

	// The pipeline needs a shell, hence `sh -c`.
	psRaw, err := exec.Command("sh", "-c", "ps aux --sort=-%cpu --no-header | head -n 50").Output()
	if err != nil {
		return nil, err
	}

	status := &ServerStatus{
		Uptime: strings.TrimSpace(string(uptimeRaw)),
		Memory: *mem,
		CPU:    *cpu,
	}
	for _, row := range strings.Split(strings.TrimSpace(string(psRaw)), "\n") {
		cols := strings.Fields(row)
		if len(cols) >= 11 {
			status.Processes = append(status.Processes, ProcessInfo{
				User:  cols[0],
				PID:   cols[1],
				CPU:   cols[2],
				MEM:   cols[3],
				VSZ:   cols[4],
				RSS:   cols[5],
				TTY:   cols[6],
				Stat:  cols[7],
				Start: cols[8],
				Time:  cols[9],
				// The command line may itself contain spaces; glue
				// the remaining columns back together.
				Cmd: strings.Join(cols[10:], " "),
			})
		}
	}
	return status, nil
}
// getIPAddress returns the first non-loopback IPv4 address among the
// system's interface addresses, in the order net.InterfaceAddrs lists them.
//
// It fails when the addresses cannot be enumerated or when none qualifies.
func getIPAddress() (*IPAddress, error) {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		return nil, err
	}

	for _, candidate := range addrs {
		ipNet, isIPNet := candidate.(*net.IPNet)
		if !isIPNet || ipNet.IP.IsLoopback() {
			continue
		}
		// To4 yields nil for addresses that are not IPv4.
		if ipNet.IP.To4() == nil {
			continue
		}
		return &IPAddress{IP: ipNet.IP.String()}, nil
	}
	return nil, fmt.Errorf("IP address not found")
}
// gpuHandler serves the /gpu route: it responds with the result of
// getGPUStatus encoded as JSON, or with a 500 and the error text when the
// status cannot be collected or encoded.
func gpuHandler(w http.ResponseWriter, r *http.Request) {
	status, err := getGPUStatus()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Encode before touching the response so an encoding failure can still
	// be reported as a clean 500 instead of a truncated 200.
	body, err := json.Marshal(status)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Without an explicit type net/http sniffs the body and labels the
	// JSON as text/plain.
	w.Header().Set("Content-Type", "application/json")
	// The trailing newline keeps the body identical to what
	// json.Encoder.Encode produced. A write error means the client went
	// away; there is nothing useful left to do with it.
	_, _ = w.Write(append(body, '\n'))
}
// gpuProcessesHandler serves the /gpu/processes route: it responds with the
// result of getGPUProcesses encoded as a JSON array (or null when the slice
// is nil), or with a 500 and the error text on failure.
func gpuProcessesHandler(w http.ResponseWriter, r *http.Request) {
	processes, err := getGPUProcesses()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Encode before touching the response so an encoding failure can still
	// be reported as a clean 500 instead of a truncated 200.
	body, err := json.Marshal(processes)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Without an explicit type net/http sniffs the body and labels the
	// JSON as text/plain.
	w.Header().Set("Content-Type", "application/json")
	// The trailing newline keeps the body identical to what
	// json.Encoder.Encode produced. A write error means the client went
	// away; there is nothing useful left to do with it.
	_, _ = w.Write(append(body, '\n'))
}
// serverHandler serves the /server route: it responds with the result of
// getServerStatus encoded as JSON, or with a 500 and the error text when the
// status cannot be collected or encoded.
func serverHandler(w http.ResponseWriter, r *http.Request) {
	status, err := getServerStatus()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Encode before touching the response so an encoding failure can still
	// be reported as a clean 500 instead of a truncated 200.
	body, err := json.Marshal(status)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Without an explicit type net/http sniffs the body and labels the
	// JSON as text/plain.
	w.Header().Set("Content-Type", "application/json")
	// The trailing newline keeps the body identical to what
	// json.Encoder.Encode produced. A write error means the client went
	// away; there is nothing useful left to do with it.
	_, _ = w.Write(append(body, '\n'))
}
// ipHandler serves the /ip route: it responds with the result of
// getIPAddress encoded as JSON, or with a 500 and the error text when no
// address can be determined or encoded.
func ipHandler(w http.ResponseWriter, r *http.Request) {
	ip, err := getIPAddress()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Encode before touching the response so an encoding failure can still
	// be reported as a clean 500 instead of a truncated 200.
	body, err := json.Marshal(ip)
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// Without an explicit type net/http sniffs the body and labels the
	// JSON as text/plain.
	w.Header().Set("Content-Type", "application/json")
	// The trailing newline keeps the body identical to what
	// json.Encoder.Encode produced. A write error means the client went
	// away; there is nothing useful left to do with it.
	_, _ = w.Write(append(body, '\n'))
}
// heartbeatHandler is the liveness probe behind /heartbeat: it ignores the
// request and always replies with the plain body "OK".
func heartbeatHandler(w http.ResponseWriter, _ *http.Request) {
	const reply = "OK"
	w.Write([]byte(reply))
}
// main registers the five monitoring routes on the default mux and serves
// them on port 12802 until the listener fails.
func main() {
	http.HandleFunc("/gpu", gpuHandler)
	http.HandleFunc("/gpu/processes", gpuProcessesHandler)
	http.HandleFunc("/server", serverHandler)
	http.HandleFunc("/ip", ipHandler)
	http.HandleFunc("/heartbeat", heartbeatHandler)

	fmt.Println("Server started at :12802")
	// ListenAndServe only returns on failure (for example when the port is
	// already in use); report the reason instead of exiting silently.
	if err := http.ListenAndServe(":12802", nil); err != nil {
		fmt.Println("Server stopped:", err)
	}
}
这份代码实现了五个 API:/gpu(GPU 的型号、温度、功率、使用率与显存)、/gpu/processes(占用 GPU 的进程)、/server(运行时长、内存、CPU 占用以及 CPU 进程)、/ip(IP 地址)以及 /heartbeat(心跳检测)。
通过设置GOOS=linux;GOARCH=amd64
在我的MacOS上交叉编译到linux,可以直接上传一个二进制包就完成部署,然后再通过linux service完成重启与自启动设置
```ini
[Unit]
Description=server monitor service
After=network.target syslog.target
Wants=network.target

[Service]
Type=simple
ExecStart=/home/xuqi/server_monitor/go_build_server_monitor_linux
Restart=always
RestartSec=10
User=xuqi

[Install]
WantedBy=multi-user.target
```
最后再通过frp服务转发到公网(因为这些服务器都在内网,不做穿透无法访问到api)
就完成了服务器监测的后端代码了。
前端则拟使用vue3框架 + element plus的组件库搭建,当然,还要使用tailwind css美化一下
简单起见,也做成一个单页面就好。
本文作者:insomnia
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!