添加 202510_定时监控vLLM进程.md
This commit is contained in:
parent
29da0b0ca8
commit
549ddf9944
121
202510_定时监控vLLM进程.md
Normal file
121
202510_定时监控vLLM进程.md
Normal file
@ -0,0 +1,121 @@
|
||||
## 修改vLLM运行gpt-oss-120b的脚本
|
||||
|
||||
在 nohup 的启动命令末尾追加如下片段,保存 vLLM 的进程 PID 号。
|
||||
|
||||
```
|
||||
echo $! > /hook/gpt-oss-120b.pid
|
||||
```
|
||||
|
||||
## 创建一个定时任务 bash 脚本
|
||||
|
||||
```
|
||||
vim /hook/timer_bash.sh
|
||||
```
|
||||
|
||||
```
|
||||
#!/bin/bash
#
# Watchdog loop around the capture helper /hook/tshark_bash.sh.
#
# Every CHECK_INTERVAL seconds the PID stored in TARGET_PID_FILE is checked:
#   * PID is numeric and the process exists -> stop the running capture
#     script, delete /hook/shark.log and /tmp/*.pcapng, then start a fresh
#     /hook/tshark_bash.sh and record its PID in TSHARK_PID_FILE.
#   * PID is invalid or the process is gone -> stop the capture script and
#     delete /tmp/*.pcapng (no restart).
#   * TARGET_PID_FILE is missing -> nothing is done in this cycle.
#
# All status messages go to stdout.

# PID file paths and other fixed locations.
readonly TARGET_PID_FILE="/hook/gpt-oss-120b.pid"
readonly TSHARK_PID_FILE="/hook/tshark_bash.pid"
readonly TSHARK_SCRIPT="/hook/tshark_bash.sh"
readonly TSHARK_LOG="/hook/tshark_bash.log"
readonly SHARK_LOG="/hook/shark.log"

# Seconds between two checks (5 minutes).
readonly CHECK_INTERVAL=300
# Seconds to wait after SIGTERM before falling back to SIGKILL.
readonly TERM_GRACE=5

# Returns 0 when a process with PID $1 exists, non-zero otherwise.
process_exists() {
  ps -p "$1" > /dev/null 2>&1
}

# Stops the process whose PID is stored in TSHARK_PID_FILE.
# SIGTERM is sent first and SIGKILL only if the process is still present
# after TERM_GRACE seconds. The recorded PID is that of the backgrounded
# `sudo` job (see start_tshark); SIGKILL cannot be caught, so a wrapper
# receiving it has no chance to pass the signal on to its child.
# The PID file is removed afterwards so that a stale PID is not signalled
# again in later cycles (it could have been reused by another process).
stop_tshark() {
  local pid
  local waited=0

  if [[ ! -f "$TSHARK_PID_FILE" ]]; then
    echo "tshark_bash.pid 文件不存在,跳过 kill 操作"
    return 0
  fi

  pid=$(cat "$TSHARK_PID_FILE" 2>/dev/null)
  if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
    echo "tshark_bash.pid 文件内容无效或为空,跳过 kill 操作"
    return 0
  fi

  if process_exists "$pid"; then
    kill "$pid" 2>/dev/null
    while (( waited < TERM_GRACE )) && process_exists "$pid"; do
      sleep 1
      waited=$(( waited + 1 ))
    done

    if ! process_exists "$pid"; then
      echo "已终止 tshark_bash 进程 (PID: $pid)"
    elif kill -9 "$pid" 2>/dev/null; then
      echo "已强制杀死 tshark_bash 进程 (PID: $pid)"
    else
      echo "无法杀死 tshark_bash 进程 (PID: $pid)"
    fi
  else
    echo "tshark_bash 进程 (PID: $pid) 已不存在,跳过 kill 操作"
  fi

  rm -f -- "$TSHARK_PID_FILE"
}

# Deletes /hook/shark.log when it exists as a regular file.
remove_shark_log() {
  if [[ -f "$SHARK_LOG" ]]; then
    rm -f -- "$SHARK_LOG"
    echo "已删除 /hook/shark.log"
  else
    echo "/hook/shark.log 文件不存在,跳过删除"
  fi
}

# Deletes every /tmp/*.pcapng entry. The glob is expanded into an array
# (with nullglob, so "no match" yields an empty array) instead of probing
# with `ls`.
remove_captures() {
  local -a captures

  shopt -s nullglob
  captures=(/tmp/*.pcapng)
  shopt -u nullglob

  if (( ${#captures[@]} > 0 )); then
    rm -rf -- "${captures[@]}"
    echo "已删除 /tmp/*.pcapng 文件"
  else
    echo "没有找到 /tmp/*.pcapng 文件,跳过删除"
  fi
}

# Starts /hook/tshark_bash.sh in the background, appending its output to
# TSHARK_LOG, and writes the PID of the background job to TSHARK_PID_FILE.
start_tshark() {
  local pid

  echo "正在启动新的 tshark_bash.sh..."
  sudo nohup bash "$TSHARK_SCRIPT" >> "$TSHARK_LOG" 2>&1 &
  pid=$!
  echo "$pid" > "$TSHARK_PID_FILE"
  echo "已启动 tshark_bash.sh,新 PID: $pid,已写入 $TSHARK_PID_FILE"
}

# Endless loop: one check every CHECK_INTERVAL seconds.
while true; do
  if [[ -f "$TARGET_PID_FILE" ]]; then
    target_pid=$(cat "$TARGET_PID_FILE" 2>/dev/null)

    # The PID must be purely numeric and belong to an existing process.
    if [[ "$target_pid" =~ ^[0-9]+$ ]] && process_exists "$target_pid"; then
      echo "$(date): 目标进程 (PID: $target_pid) 存活,执行相应操作..."
      stop_tshark
      remove_shark_log
      remove_captures
      start_tshark
    else
      echo "$(date): 目标进程 (PID: ${target_pid:-未知}) 不存在或已挂掉,执行清理操作..."
      stop_tshark
      remove_captures
      echo "目标进程已挂掉,已执行清理操作。"
    fi
  else
    echo "$(date): 目标 PID 文件 $TARGET_PID_FILE 不存在,跳过本次检查。"
  fi

  sleep "$CHECK_INTERVAL"
done
|
||||
```
|
||||
|
||||
## 运行定时任务脚本
|
||||
|
||||
##### 第一次运行先赋权
|
||||
|
||||
```
|
||||
chmod +x /hook/timer_bash.sh
|
||||
```
|
||||
|
||||
##### 运行脚本
|
||||
|
||||
```
|
||||
sudo nohup /hook/timer_bash.sh > /hook/timer_bash.log 2>&1 & echo $! > /hook/timer_bash.pid
|
||||
```
|
||||
|
||||
##### 停止脚本
|
||||
|
||||
```
|
||||
sudo kill "$(cat /hook/timer_bash.pid)"
|
||||
```
|
||||
Loading…
Reference in New Issue
Block a user