From b79bd8bbc452a861e3460657aab78eb267cb52cc Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 11:14:26 +0800 Subject: [PATCH 01/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2020251011.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251011.md | 285 ++-------------------------------------------------- 1 file changed, 7 insertions(+), 278 deletions(-) diff --git a/20251011.md b/20251011.md index 21f16be..3960ba1 100644 --- a/20251011.md +++ b/20251011.md @@ -2,292 +2,21 @@ ## 问题原因 -vllm在处理请求时,调用了 get_structured_output_key 函数,在处理 structured_outputs 参数时,由于不满足所有的情况,导致了抛出**No valid structured output parameter found**异常,该异常未被 EngineCore 捕获导致引擎崩溃,APIServer 发现引擎死了,自己也退出了进程。 - -## 使用抓包工具 tshark 监控 http 请求 - -### 在合适的目录下执行下面的命令,tshark 需要 root 权限,请使用 root 账号执行 - -##### 安装 tshark(已执行完毕) -``` -sudo apt-get install tshark -``` - -##### 创建一个 shell 脚本 -``` -vim hook_vllm_gpt-oss-120b.sh -``` -脚本内容如下: -``` -#!/bin/bash - -# ======================================== -# 监控本地 v1/chat/completions 接口的 HTTP 请求与响应 -# 仅保留最近 20 条日志(含流式响应,如 SSE) -# 使用 tshark + TCP 流跟踪 -# ======================================== - -# 配置 -INTERFACE="lo" # 本地回环接口 -PORT="8077" -ENDPOINT="/v1/chat/completions" -LOG_FILE="/hook/chat_completions.log" -TEMP_LOG="/hook/chat_completions.tmp" -PID_FILE="/hook/hook_vllm_gpt-oss-120b.pid" - -# 检查是否已运行 -if [ -f "$PID_FILE" ]; then - if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then - echo "【错误】监控脚本已在运行 (PID: $(cat $PID_FILE))" - exit 1 - else - rm -f "$PID_FILE" - fi -fi - -# 记录 PID -echo $$ > "$PID_FILE" - -# 清空日志 -> "$LOG_FILE" - -echo "✅ 开始监控 http://127.0.0.1:$PORT$ENDPOINT" -echo "📝 日志将保存到: $LOG_FILE" -echo "⏳ 仅保留最近 20 条,按 Ctrl+C 停止" - -# 信号处理:清理 PID 文件 -trap 'rm -f "$PID_FILE"; echo "⏹️ 监控已停止"; exit 0' SIGINT SIGTERM - -# 使用 tshark 跟踪 TCP 流 -sudo tshark \ - -i "$INTERFACE" \ - -f "tcp port $PORT and host 127.0.0.1" \ - -q \ - -z "follow,tcp,ascii" \ - 2>/dev/null | \ -stdbuf -oL awk -v endpoint="$ENDPOINT" -v log="$LOG_FILE" -v temp="$TEMP" ' -BEGIN { - RS = "\n\n" - ORS = "" - in_request = 0 - buffer = "" - count = 0 -} - -# 分割流,识别每条 TCP 流 -{ - if (match($0, /GET|POST|PUT|DELETE|HTTP/) && index($0, endpoint)) { - # 提取时间戳(tshark 输出第一行包含时间) - if (match($0, /Following.*on port [0-9]+$/)) { - ts = substr($0, RSTART, RLENGTH) - gsub(/.*on/, "on", ts) - } else { - ts = "unknown time" - } - - # 提取请求行和头 - split($0, lines, /\n/) - for (i=1; i<=length(lines); i++) { - if (lines[i] ~ /(GET|POST|PUT|DELETE)/ && index(lines[i], endpoint)) { - request_line = lines[i] - } - if (lines[i] ~ /Content-Type:/ || lines[i] ~ /Authorization:/ || lines[i] ~ /User-Agent:/) { - headers = headers "\n " lines[i] - } - } - - # 提取请求体(通常在空行后) - body = "" - in_body = 0 - for (i=1; i<=length(lines); i++) { - if (lines[i] == "" || lines[i] ~ /HTTP\/[0-9.]/) { - in_body = 1 - continue - } - if (in_body && lines[i] !~ /(No response found|Following)/) { - body = body lines[i] "\n" - } - } - - # 提取响应部分(HTTP/ 开头) - response = "" - for (i=1; i<=length(lines); i++) { - if (lines[i] ~ /^HTTP\// && i > 1) { - for (j=i; j<=length(lines); j++) { - if (lines[j] !~ /Following/) { - response = response lines[j] "\n" - } - } - break - } - } - - # 构造日志条目 - entry = "========================================\n" - entry = entry "🕒 " ts "\n" - entry = entry "📤 请求: " request_line "\n" - if (headers != "") { - entry = entry "📎 头部:\n" headers "\n" - } - if (body != "") { - entry = entry "📦 请求体:\n" body "\n" - } - if (response != "") { - entry = entry "📥 响应:\n" response "\n" - } - entry = entry "========================================\n\n" - - # 写入日志并保留最近 20 条 - cmd = "echo -e \"" entry "\" >> " log " && tail -n 200 " log " > " temp " && mv " temp " " log - system(cmd) - - # 重置 - headers = "" - body = "" - response = "" - } -} -' - -# 正常退出时清理 -rm -f "$PID_FILE" -``` - -##### 赋予执行权限 -``` -chmod +x hook_vllm_gpt-oss-120b.sh -``` - -##### 后台运行脚本 -``` -nohup /hook/hook_vllm_gpt-oss-120b.sh > /dev/null 2>&1 & -``` - -##### 查看请求日志 - -``` -tail -f /hook/chat_completions.log -``` - -##### 停止脚本 - -``` -pkill -f hook_vllm_gpt-oss-120b.sh -``` - -### 函数 get_structured_output_key 实现 - -#### 分支:release/v0.11.0 +由于外部调用 vllm 的 OpenAI API 服务时候,传入的请求参数让 vllm 调用了**get_structured_output_key**函数。在该函数里不能被正确处理,抛出了**No valid structured output parameter found**异常,该异常导致了 vllm 的 EngineCore 和 APIServer 进程死亡。 +## 源码出处 https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/v1/structured_output/request.py -```python -def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.structured_outputs - assert params is not None, "params can't be None." - if params.json is not None: - if not isinstance(params.json, str): - json_str = json.dumps(params.json) - else: - json_str = params.json - return (StructuredOutputOptions.JSON, json_str) - elif params.json_object: - return (StructuredOutputOptions.JSON_OBJECT, "") - elif params.regex is not None: - return (StructuredOutputOptions.REGEX, params.regex) - elif params.choice is not None: - if not isinstance(params.choice, str): - json_str = json.dumps(params.choice) - else: - json_str = params.choice - return (StructuredOutputOptions.CHOICE, json_str) - elif params.grammar is not None: - return (StructuredOutputOptions.GRAMMAR, params.grammar) - elif params.structural_tag is not None: - return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) - else: - raise ValueError("No valid structured output parameter found") -``` -#### 分支:release/v0.10.2 +## 问题追踪 -https://github.com/vllm-project/vllm/blob/releases/v0.10.2/vllm/v1/structured_output/request.py +由于 vllm 没有提供 http 请求参数的日志打印,也没有集成监控 http 请求的三方工具。所以在 Ubuntu 上安装了 tshark 抓包工具。通过 Java 脚本启动 tshark 命令,并将 tshark 抓包到的日志内容写入到磁盘文件。下一次 vllm 崩溃时,根据磁盘文件存储的日志内容分析是什么参数导致了**get_structured_output_key**的异常。 -```python -def get_structured_output_key( - sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.guided_decoding - assert params is not None, "params can't be None." - if params.json is not None: - if not isinstance(params.json, str): - json_str = json.dumps(params.json) - else: - json_str = params.json - return (StructuredOutputOptions.JSON, json_str) - elif params.json_object: - return (StructuredOutputOptions.JSON_OBJECT, "") - elif params.regex is not None: - return (StructuredOutputOptions.REGEX, params.regex) - elif params.choice is not None: - if not isinstance(params.choice, str): - json_str = json.dumps(params.choice) - else: - json_str = params.choice - return (StructuredOutputOptions.CHOICE, json_str) - elif params.grammar is not None: - return (StructuredOutputOptions.GRAMMAR, params.grammar) - elif params.structural_tag is not None: - return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) - else: - raise ValueError("No valid structured output parameter found") -``` -> 2个版本的唯一区别,仅仅是提取`SamplingParams`的属性不一样,其它判断逻辑都是一致的 +> 使用 Java 执行 tshark 是为了抹掉 Linux 和 Windows 的平台差异,不用修改代码和命令即可直接运行。 -### SamplingParams +> Java 脚本内容: -https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/sampling_params.py - -```python -# 其它参数已省略 -class SamplingParams( - 💡初始化逻辑是没问题的,默认值是None - structured_outputs: Optional[StructuredOutputsParams] = None - guided_decoding: Optional[GuidedDecodingParams] = None - - @staticmethod - def from_optional( - 💡默认值也是None - structured_outputs: Optional[StructuredOutputsParams] = None, - ) -> "SamplingParams": - if guided_decoding is not None: - warnings.warn( - "guided_decoding is deprecated. This will be removed in " - 💡官方将在 v0.12.0 废弃 guided_decoding 参数,使用 structured_outputs 参数替代,在 v0.11.0 版本做了兼容, - "v0.12.0 or v1.0.0, which ever is soonest. Please use " - "structured_outputs instead.", - DeprecationWarning, - stacklevel=2) - structured_outputs = guided_decoding - guided_decoding = None - return SamplingParams( - structured_outputs=structured_outputs, - ) -``` - -### StructuredOutputOptions - -```python -class StructuredOutputOptions(enum.Enum): - JSON = enum.auto() - JSON_OBJECT = enum.auto() - REGEX = enum.auto() - GRAMMAR = enum.auto() - CHOICE = enum.auto() - STRUCTURAL_TAG = enum.auto() -``` -> 💡只支持这6种类型,每个类型都对应 structured_outputs 下面的一个不同的参数。 - -## 崩溃日志片段 +## 崩溃日志 ```text (EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] EngineCore encountered a fatal error. From 108e1aa86cb9aa36accc83929568b6fc155c7e19 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:00:00 +0800 Subject: [PATCH 02/27] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=2020251017.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251017.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 20251017.md diff --git a/20251017.md b/20251017.md new file mode 100644 index 0000000..3fdc173 --- /dev/null +++ b/20251017.md @@ -0,0 +1 @@ +122 \ No newline at end of file From a81ec5d0bcbc07461d3011d976ae9fa8fe1c058c Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:00:26 +0800 Subject: [PATCH 03/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2020251017.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251017.md | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/20251017.md b/20251017.md index 3fdc173..50aa383 100644 --- a/20251017.md +++ b/20251017.md @@ -1 +1,143 @@ -122 \ No newline at end of file +## 脚本说明 + +- 在 Linux 运行`TShark.java`必须先安装**tshark**,在 Windows 上运行必须先安装**Wireshark**客户端,在安装时勾选**tshark**组件。 +- 必须使用超级管理员账号运行`TShark.java`,这是**tshark**命令的限制,在 Windows 上可以使用管理员权限打开命令行运行。 +- Java 脚本基于**Java21**开发,可直接使用`java TShark.java`运行,运行后会在当前目录下创建一个**shark.log**文件,保存抓包日志。 +- 建议创建一个 bash 或者 cmd 脚本,在 bash 或 cmd 脚本中配置好 JDK 环境变量,然后再使用管理员权限运行 bash 或 cmd 脚本。 + +```java +import java.io.*; +import java.nio.file.Path; +import java.time.ZoneId; +import java.util.List; +import java.util.TimeZone; + +public final class TShark { + + public static void main(String[] args) throws Exception { + TimeZone.setDefault(TimeZone.getTimeZone(ZoneId.of("Asia/Shanghai"))); + Path path = Path.of("shark.log"); + File sharkLog = path.toFile(); + if (!sharkLog.exists() && !sharkLog.createNewFile()) { + throw new RuntimeException("create shark.log failure"); + } +// DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + Process process = startSharkProcess(); + System.out.println("start tshark success"); + startProcessSharkThread(process, sharkLog); + System.out.println("start process thread success"); + int exitCode = process.waitFor(); + System.out.println("tshark process is dead, exit code: " + exitCode); + } + + private static void startProcessSharkThread(Process process, File sharkLog) { + Thread thread = new Thread(() -> { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream())); + BufferedWriter fileWriter = new BufferedWriter(new FileWriter(sharkLog, true))) { + String line; + while ((line = reader.readLine()) != null) { + fileWriter.write(line + System.lineSeparator()); + } + fileWriter.flush(); + } catch (Exception e) { + e.printStackTrace(); + } + }); + thread.setName("TShark-thread"); + thread.start(); + } + + public static Process startSharkProcess() throws IOException { + List cmd = List.of( + "tshark", + "-l", + "-i", + "\\Device\\NPF_{807C63AC-179D-4AC8-BD56-85CE8AA179DB}", + "-Y", + "tcp.port == 33000 && http.request.method == \"POST\"", + "-V" + ); + ProcessBuilder processBuilder = new ProcessBuilder(cmd); + processBuilder.redirectErrorStream(true); + return processBuilder.start(); + } +} +``` + +##### 通过 NEWAPI 调用 vLLM 运行的 gpt-oss-120b 模型 + +```java +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.time.Duration; +import java.util.concurrent.CompletableFuture; + +public class HttpPost { + + public static void main(String[] args) { + try (HttpClient httpClient = HttpClient.newHttpClient()) { + HttpRequest httpRequest = HttpRequest.newBuilder() + .uri(URI.create(apiUrl)) + .header("Content-Type", "application/json") + .header("Authorization", apiKey) + .timeout(Duration.ofMinutes(5L)) + .POST(HttpRequest.BodyPublishers.ofString(request)) + .build(); + CompletableFuture completableFuture = httpClient.sendAsync(httpRequest, HttpResponse.BodyHandlers.ofLines()).thenAccept(response -> { + System.out.println("SSE success status " + response.statusCode() + ", response body: "); + response.body().forEach(System.out::println); + }).exceptionally(throwable -> { + System.out.println("SSE failure error message: " + throwable.getMessage()); + throwable.printStackTrace(); + return null; + }); + try { + completableFuture.get(); + } catch (Exception e) { + e.printStackTrace(); + } + } + } + + static final String apiUrl = "http://10.159.252.49:33000/v1/chat/completions"; + + static final String apiKey = "Bearer sk-rbWEGdsaZ47e2hQFZt2xRHqWRZaipYlkyLqHdU2z9FlWj7D3"; + + static final String request = """ + { + "model": "gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": "成龙今年71岁,请把这句话的信息,按照`user_response`结构生成一个JSON" + } + ], + "stream": true, + "extra_body":{ + "structured_outputs": true, + "guided_decoding": true + }, + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "user_response", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "number"} + }, + "required": ["name", "age"], + "additionalProperties": false + }, + "strict": true + } + } + } + """.strip(); + +} + +``` \ No newline at end of file From 52a3ad872f2e30874a44b291c21d11d66379ab59 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:00:52 +0800 Subject: [PATCH 04/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2020251017.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251017.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/20251017.md b/20251017.md index 50aa383..d7a16fd 100644 --- a/20251017.md +++ b/20251017.md @@ -5,6 +5,8 @@ - Java 脚本基于**Java21**开发,可直接使用`java TShark.java`运行,运行后会在当前目录下创建一个**shark.log**文件,保存抓包日志。 - 建议创建一个 bash 或者 cmd 脚本,在 bash 或 cmd 脚本中配置好 JDK 环境变量,然后再使用管理员权限运行 bash 或 cmd 脚本。 +##### TShark.java + ```java import java.io.*; import java.nio.file.Path; From 8617dc7e84d55752e3f2ee8eab21fb2f9be0b325 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:05:05 +0800 Subject: [PATCH 05/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20=E5=A4=9A=E6=99=BA?= =?UTF-8?q?=E8=83=BD=E4=BD=93=E5=BC=80=E5=8F=91=E6=8A=80=E6=9C=AF=E6=96=B9?= =?UTF-8?q?=E6=A1=88=E5=AF=B9=E6=AF=94.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251010.md => 多智能体开发技术方案对比.md | 2 -- 1 file changed, 2 deletions(-) rename 20251010.md => 多智能体开发技术方案对比.md (99%) diff --git a/20251010.md b/多智能体开发技术方案对比.md similarity index 99% rename from 20251010.md rename to 多智能体开发技术方案对比.md index 77d72f2..7695cdd 100644 --- a/20251010.md +++ b/多智能体开发技术方案对比.md @@ -1,5 +1,3 @@ -# 多智能体开发技术方案对比 - ## LLM基础应用框架评测对象 - Spring AI Alibaba(SAA) From 98ea7aea236ba0e2da852b180ae556c05566f219 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:05:41 +0800 Subject: [PATCH 06/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20=E6=8E=92=E6=9F=A5vL?= =?UTF-8?q?LM=E5=B4=A9=E6=BA=83=E9=97=AE=E9=A2=98.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251011.md => 排查vLLM崩溃问题.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 20251011.md => 排查vLLM崩溃问题.md (100%) diff --git a/20251011.md b/排查vLLM崩溃问题.md similarity index 100% rename from 20251011.md rename to 排查vLLM崩溃问题.md From caf952a13c0c04e8bd7a42a13683c15bbc72ff97 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:05:51 +0800 Subject: [PATCH 07/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20=E6=8E=92=E6=9F=A5vL?= =?UTF-8?q?LM=E5=B4=A9=E6=BA=83=E9=97=AE=E9=A2=98.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 排查vLLM崩溃问题.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/排查vLLM崩溃问题.md b/排查vLLM崩溃问题.md index 3960ba1..4b5f86a 100644 --- a/排查vLLM崩溃问题.md +++ b/排查vLLM崩溃问题.md @@ -1,5 +1,3 @@ -# vllm 崩溃问题排查 - ## 问题原因 由于外部调用 vllm 的 OpenAI API 服务时候,传入的请求参数让 vllm 调用了**get_structured_output_key**函数。在该函数里不能被正确处理,抛出了**No valid structured output parameter found**异常,该异常导致了 vllm 的 EngineCore 和 APIServer 进程死亡。 From 980e3041559d5acd92501477e3358c529c493568 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:07:11 +0800 Subject: [PATCH 08/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=200=5F=E8=B7=9FDocker?= =?UTF-8?q?=E7=9B=B8=E5=85=B3=E7=9A=84=E8=84=9A=E6=9C=AC.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251013.md => 0_跟Docker相关的脚本.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 20251013.md => 0_跟Docker相关的脚本.md (100%) diff --git a/20251013.md b/0_跟Docker相关的脚本.md similarity index 100% rename from 20251013.md rename to 0_跟Docker相关的脚本.md From 22c131fde053982bb250c3e49cd7eb05d3341156 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:07:22 +0800 Subject: [PATCH 09/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=200=5F=E8=B7=9FDocker?= =?UTF-8?q?=E7=9B=B8=E5=85=B3=E7=9A=84=E8=84=9A=E6=9C=AC.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 0_跟Docker相关的脚本.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/0_跟Docker相关的脚本.md b/0_跟Docker相关的脚本.md index 125931d..2c755cb 100644 --- a/0_跟Docker相关的脚本.md +++ b/0_跟Docker相关的脚本.md @@ -1,5 +1,3 @@ -# 常用的 Docker 镜像脚本和命令 - ## 安装 DPanel ```shell From de1ef9ba022b146979dbafb306fdbde2ad5d15c9 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:07:56 +0800 Subject: [PATCH 10/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E6=8E=92?= =?UTF-8?q?=E6=9F=A5vLLM=E5=B4=A9=E6=BA=83=E9=97=AE=E9=A2=98.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 排查vLLM崩溃问题.md => 202510_排查vLLM崩溃问题.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 排查vLLM崩溃问题.md => 202510_排查vLLM崩溃问题.md (100%) diff --git a/排查vLLM崩溃问题.md b/202510_排查vLLM崩溃问题.md similarity index 100% rename from 排查vLLM崩溃问题.md rename to 202510_排查vLLM崩溃问题.md From bb3202c2dc96a1f7e7650bc4689a018608d199d9 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:08:17 +0800 Subject: [PATCH 11/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E5=A4=9A?= =?UTF-8?q?=E6=99=BA=E8=83=BD=E4=BD=93=E5=BC=80=E5=8F=91=E6=8A=80=E6=9C=AF?= =?UTF-8?q?=E6=96=B9=E6=A1=88=E5=AF=B9=E6=AF=94.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 多智能体开发技术方案对比.md => 202510_多智能体开发技术方案对比.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 多智能体开发技术方案对比.md => 202510_多智能体开发技术方案对比.md (100%) diff --git a/多智能体开发技术方案对比.md b/202510_多智能体开发技术方案对比.md similarity index 100% rename from 多智能体开发技术方案对比.md rename to 202510_多智能体开发技术方案对比.md From 3c739bb5f781302c6029381b6dd5398a6026aa40 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:08:45 +0800 Subject: [PATCH 12/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251014.md => 202510_RTX4090笔电操作记录.md | 2 -- 1 file changed, 2 deletions(-) rename 20251014.md => 202510_RTX4090笔电操作记录.md (98%) diff --git a/20251014.md b/202510_RTX4090笔电操作记录.md similarity index 98% rename from 20251014.md rename to 202510_RTX4090笔电操作记录.md index 81b55c6..1004f81 100644 --- a/20251014.md +++ b/202510_RTX4090笔电操作记录.md @@ -1,5 +1,3 @@ -# RTX4090笔电操作记录 - ```shell # 因清华大学开源镜像站 HTTP/403 换了中科大的镜像站,配置信息存放在这里 cat /etc/apt/sources.list From 1ac22934f3cee8e496266251082e00f547496b6a Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:09:03 +0800 Subject: [PATCH 13/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_RTX4090笔电操作记录.md | 1 - 1 file changed, 1 deletion(-) diff --git a/202510_RTX4090笔电操作记录.md b/202510_RTX4090笔电操作记录.md index 1004f81..a3a0409 100644 --- a/202510_RTX4090笔电操作记录.md +++ b/202510_RTX4090笔电操作记录.md @@ -58,5 +58,4 @@ vllm serve /home/ss/vllm-py12/qwen3-06b \ --gpu-memory-utilization 0.9 \ --max-model-len 32768 \ --trust-remote-code - ``` \ No newline at end of file From 29da0b0ca8882990cfeefcdb75820a5583d547ba Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 14:23:32 +0800 Subject: [PATCH 14/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E4=BD=BF?= =?UTF-8?q?=E7=94=A8Java=E6=89=A7=E8=A1=8Ctshark=E5=91=BD=E4=BB=A4?= =?UTF-8?q?=E6=8A=93=E5=8C=85.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251017.md => 202510_使用Java执行tshark命令抓包.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 20251017.md => 202510_使用Java执行tshark命令抓包.md (100%) diff --git a/20251017.md b/202510_使用Java执行tshark命令抓包.md similarity index 100% rename from 20251017.md rename to 202510_使用Java执行tshark命令抓包.md From 549ddf9944706364e7a5a37d7d03ee65989dad72 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 16:25:34 +0800 Subject: [PATCH 15/27] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20202510=5F=E5=AE=9A?= =?UTF-8?q?=E6=97=B6=E7=9B=91=E6=8E=A7vLLM=E8=BF=9B=E7=A8=8B.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_定时监控vLLM进程.md | 121 +++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 202510_定时监控vLLM进程.md diff --git a/202510_定时监控vLLM进程.md b/202510_定时监控vLLM进程.md new file mode 100644 index 0000000..418c564 --- /dev/null +++ b/202510_定时监控vLLM进程.md @@ -0,0 +1,121 @@ +## 修改vLLM运行gpt-oss-120b的脚本 + +在 nohup 的启动命令末尾追加如下片段,保存 vLLM 的进程 PID 号。 + +``` +echo $! > /hook/gpt-oss-120b.pid +``` + +## 创建一个定时任务 bash 脚本 + +``` +vim /hook/timer_bash.sh +``` + +``` +#!/bin/bash + +# 定义 PID 文件路径 +TARGET_PID_FILE="/hook/gpt-oss-120b.pid" +TSARK_PID_FILE="/hook/tshark_bash.pid" + +# 无限循环,每隔 5 分钟(300 秒)检查一次 +while true; do + # 检查目标 PID 文件是否存在 + if [[ -f "$TARGET_PID_FILE" ]]; then + TARGET_PID=$(cat "$TARGET_PID_FILE" 2>/dev/null) + + # 检查读取到的 PID 是否为数字且进程是否存在 + if [[ "$TARGET_PID" =~ ^[0-9]+$ ]] && ps -p "$TARGET_PID" > /dev/null 2>&1; then + echo "$(date): 目标进程 (PID: $TARGET_PID) 存活,执行相应操作..." + + # 1. 杀掉 tshark_bash 进程 + if [[ -f "$TSARK_PID_FILE" ]]; then + TSHARK_PID=$(cat "$TSARK_PID_FILE" 2>/dev/null) + if [[ "$TSHARK_PID" =~ ^[0-9]+$ ]]; then + kill -9 "$TSHARK_PID" 2>/dev/null + echo "已强制杀死 tshark_bash 进程 (PID: $TSHARK_PID)" + else + echo "tshark_bash.pid 文件内容无效或为空,跳过 kill 操作" + fi + else + echo "tshark_bash.pid 文件不存在,跳过 kill 操作" + fi + + # 2. 删除 shark.log + if [[ -f "/hook/shark.log" ]]; then + rm -rf "/hook/shark.log" + echo "已删除 /hook/shark.log" + else + echo "/hook/shark.log 文件不存在,跳过删除" + fi + + # 3. 删除 /tmp/*.pcapng + if ls /tmp/*.pcapng >/dev/null 2>&1; then + rm -rf /tmp/*.pcapng + echo "已删除 /tmp/*.pcapng 文件" + else + echo "没有找到 /tmp/*.pcapng 文件,跳过删除" + fi + + # 4. 启动新的 tshark_bash.sh 并记录 PID + echo "正在启动新的 tshark_bash.sh..." + sudo nohup bash /hook/tshark_bash.sh >> /hook/tshark_bash.log 2>&1 & + NEW_TSHARK_PID=$! + echo "$NEW_TSHARK_PID" > "$TSARK_PID_FILE" + echo "已启动 tshark_bash.sh,新 PID: $NEW_TSHARK_PID,已写入 $TSARK_PID_FILE" + + else + echo "$(date): 目标进程 (PID: ${TARGET_PID:-未知}) 不存在或已挂掉,执行清理操作..." + + # 1. 杀掉 tshark_bash 进程 + if [[ -f "$TSARK_PID_FILE" ]]; then + TSHARK_PID=$(cat "$TSARK_PID_FILE" 2>/dev/null) + if [[ "$TSHARK_PID" =~ ^[0-9]+$ ]]; then + kill -9 "$TSHARK_PID" 2>/dev/null + echo "已强制杀死 tshark_bash 进程 (PID: $TSHARK_PID)" + else + echo "tshark_bash.pid 文件内容无效或为空,跳过 kill 操作" + fi + else + echo "tshark_bash.pid 文件不存在,跳过 kill 操作" + fi + + # 2. 删除 /tmp/*.pcapng + if ls /tmp/*.pcapng >/dev/null 2>&1; then + rm -rf /tmp/*.pcapng + echo "已删除 /tmp/*.pcapng 文件" + else + echo "没有找到 /tmp/*.pcapng 文件,跳过删除" + fi + + echo "目标进程已挂掉,已执行清理操作。" + fi + else + echo "$(date): 目标 PID 文件 $TARGET_PID_FILE 不存在,跳过本次检查。" + fi + + # 等待 5 分钟 (300 秒) + sleep 300 +done +``` + +## 运行定时任务脚本 + +##### 第一次运行先赋权 + +``` +chmod +x vim /hook/timer_bash.sh +``` + +##### 运行脚本 + +``` +sudo nohup /hook/timer_bash.sh > /hook/timer_bash.log 2>&1 & echo $! > /hook/timer_bash.pid +``` + +##### 停止脚本 + +``` +kill $(cat /hook/timer_bash.pid) +``` \ No newline at end of file From c42e1509f5acb7ecf228e7e739615ca2e6001e58 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 16:28:41 +0800 Subject: [PATCH 16/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E5=AE=9A?= =?UTF-8?q?=E6=97=B6=E7=9B=91=E6=8E=A7vLLM=E8=BF=9B=E7=A8=8B.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_定时监控vLLM进程.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/202510_定时监控vLLM进程.md b/202510_定时监控vLLM进程.md index 418c564..476467e 100644 --- a/202510_定时监控vLLM进程.md +++ b/202510_定时监控vLLM进程.md @@ -100,7 +100,7 @@ while true; do done ``` -## 运行定时任务脚本 +## 定时任务 ##### 第一次运行先赋权 @@ -108,14 +108,28 @@ done chmod +x vim /hook/timer_bash.sh ``` -##### 运行脚本 +##### 运行 ``` sudo nohup /hook/timer_bash.sh > /hook/timer_bash.log 2>&1 & echo $! > /hook/timer_bash.pid ``` -##### 停止脚本 +##### 停止 ``` kill $(cat /hook/timer_bash.pid) +``` + +## 抓包脚本 + +##### 运行 + +``` +sudo nohup bash /hook/tshark_bash.sh >> /hook/tshark_bash.log 2>&1 & echo $! > /hook/tshark_bash.pid +``` + +##### 停止 + +``` +kill -9 $(cat /hook/tshark_bash.pid) ``` \ No newline at end of file From 37c6857458e1db82453e537ba5ceeb560b0d82ee Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 16:36:45 +0800 Subject: [PATCH 17/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_RTX4090笔电操作记录.md | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/202510_RTX4090笔电操作记录.md b/202510_RTX4090笔电操作记录.md index a3a0409..be43114 100644 --- a/202510_RTX4090笔电操作记录.md +++ b/202510_RTX4090笔电操作记录.md @@ -32,24 +32,14 @@ pip install vllm -i http://mirrors.cloud.tencent.com/pypi/simple --extra-index-u # 安装 modelscope pip install modelscope -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com -# 拉取 gpt-oss-20b 模型 +# 拉取 gpt-oss-20b 模型,由于显存不足,运行失败了 modelscope download --model openai-mirror/gpt-oss-20b --local_dir /home/ss/vllm-py12/gpt-oss-20b -# 运行 gpt-oss-20b 模型失败,移动端的 RTX4090 只有 16GB 显存,至少需要 16~24GB 显存 -vllm serve \ - /home/ss/vllm-py12/gpt-oss-20b \ - --port 18777 \ - --api-key token_lcfc \ - --served-model-name gpt-oss-20b \ - --gpu-memory-utilization 0.95 \ - --tool-call-parser openai \ - --enable-auto-tool-choice - -# Qwen3-8b 也需要 16~24GB显存,所以下载了 Qwen3-0.6B +# 下载了 Qwen3-0.6B modelscope download --model Qwen/Qwen3-0.6B --local_dir /home/ss/vllm-py12/qwen3-06b -# 运行 Qwen3-8b -vllm serve /home/ss/vllm-py12/qwen3-06b \ +# 运行 Qwen3-0.6B +nohup vllm serve /home/ss/vllm-py12/qwen3-06b \ --host 0.0.0.0 \ --port 8000 \ --served-model-name Qwen3-0.6B \ @@ -57,5 +47,14 @@ vllm serve /home/ss/vllm-py12/qwen3-06b \ --dtype auto \ --gpu-memory-utilization 0.9 \ --max-model-len 32768 \ - --trust-remote-code + --trust-remote-code \ + >> /home/ss/vllm-py12/vllm.log 2>&1 \ + & echo $! > /home/ss/vllm-py12/vllm.pid + +# 安装了抓包工具 tshark 和 ngrep +sudo apt install ngrep +sudo apt-get install tshark + +# 运行了1个定时任务脚本 + ``` \ No newline at end of file From c512c827cce11442ddf63b0339a1ae9e2904e848 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 16:47:45 +0800 Subject: [PATCH 18/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_RTX4090笔电操作记录.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/202510_RTX4090笔电操作记录.md b/202510_RTX4090笔电操作记录.md index be43114..adc91a3 100644 --- a/202510_RTX4090笔电操作记录.md +++ b/202510_RTX4090笔电操作记录.md @@ -1,4 +1,3 @@ -```shell # 因清华大学开源镜像站 HTTP/403 换了中科大的镜像站,配置信息存放在这里 cat /etc/apt/sources.list @@ -55,6 +54,13 @@ nohup vllm serve /home/ss/vllm-py12/qwen3-06b \ sudo apt install ngrep sudo apt-get install tshark -# 运行了1个定时任务脚本 +# 通过 java 脚本调用 tshark 提取关键日志 +sudo nohup bash /home/ss/vllm-py12/tshark_bash.sh >> /home/ss/vllm-py12/tshark_bash.log 2>&1 & echo $! > /home/ss/vllm-py12/tshark_bash.pid +# 运行了1个定时任务脚本,清理 tshark 的临时文件并重启 java 脚本 +sudo nohup /home/ss/vllm-py12/timer_bash.sh > /home/ss/vllm-py12/timer_bash.log 2>&1 & echo $! > /home/ss/vllm-py12/timer_bash.pid + +# 杀死上面2个进程的命令 +kill -9 $(cat /home/ss/vllm-py12/timer_bash.log) +kill -9 $(cat /home/ss/vllm-py12/tshark_bash.log) ``` \ No newline at end of file From 6910dce65adf809d1878af31c97a9bd9a3e5b5f0 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 16:48:15 +0800 Subject: [PATCH 19/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_RTX4090笔电操作记录.md | 1 + 1 file changed, 1 insertion(+) diff --git a/202510_RTX4090笔电操作记录.md b/202510_RTX4090笔电操作记录.md index adc91a3..480424c 100644 --- a/202510_RTX4090笔电操作记录.md +++ b/202510_RTX4090笔电操作记录.md @@ -1,3 +1,4 @@ +``` # 因清华大学开源镜像站 HTTP/403 换了中科大的镜像站,配置信息存放在这里 cat /etc/apt/sources.list From be520ea5335631116a48fcd15fa6b58ac97f067b Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 16:48:42 +0800 Subject: [PATCH 20/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_RTX4090笔电操作记录.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/202510_RTX4090笔电操作记录.md b/202510_RTX4090笔电操作记录.md index 480424c..497640f 100644 --- a/202510_RTX4090笔电操作记录.md +++ b/202510_RTX4090笔电操作记录.md @@ -62,6 +62,6 @@ sudo nohup bash /home/ss/vllm-py12/tshark_bash.sh >> /home/ss/vllm-py12/tshark_b sudo nohup /home/ss/vllm-py12/timer_bash.sh > /home/ss/vllm-py12/timer_bash.log 2>&1 & echo $! > /home/ss/vllm-py12/timer_bash.pid # 杀死上面2个进程的命令 -kill -9 $(cat /home/ss/vllm-py12/timer_bash.log) -kill -9 $(cat /home/ss/vllm-py12/tshark_bash.log) +sudo kill -9 $(cat /home/ss/vllm-py12/timer_bash.log) +sudo kill -9 $(cat /home/ss/vllm-py12/tshark_bash.log) ``` \ No newline at end of file From 1ce8d10bc1391f656efe3af590742ebbd5ec265a Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 17:01:15 +0800 Subject: [PATCH 21/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E5=AE=9A?= =?UTF-8?q?=E6=97=B6=E7=9B=91=E6=8E=A7vLLM=E8=BF=9B=E7=A8=8B.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_定时监控vLLM进程.md | 60 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/202510_定时监控vLLM进程.md b/202510_定时监控vLLM进程.md index 476467e..ff1a69d 100644 --- a/202510_定时监控vLLM进程.md +++ b/202510_定时监控vLLM进程.md @@ -15,21 +15,33 @@ vim /hook/timer_bash.sh ``` #!/bin/bash -# 定义 PID 文件路径 +# 目标进程的 PID 文件路径 TARGET_PID_FILE="/hook/gpt-oss-120b.pid" + +# tshark_bash 的 PID 文件路径 TSARK_PID_FILE="/hook/tshark_bash.pid" -# 无限循环,每隔 5 分钟(300 秒)检查一次 +# tshark_bash 脚本路径 +TSARK_SCRIPT_PATH="/hook/tshark_bash.sh" + +# tshark_bash 日志文件路径 +TSARK_LOG_FILE="/hook/tshark_bash.log" + +# shark.log 文件路径 +SHARK_LOG_FILE="/hook/shark.log" + +# 临时目录下要删除的 pcapng 文件路径模式 +PCAPNG_FILES_PATTERN="/tmp/*.pcapng" + +# 5分钟检查一次 vllm 进程是否存活 +# 若 vllm 进程存活,清理 tshark 的临时文件,然后重启 tshark 脚本继续抓包 +# 若 vllm 进程已死亡则 kill tshark 的进程 + while true; do - # 检查目标 PID 文件是否存在 if [[ -f "$TARGET_PID_FILE" ]]; then TARGET_PID=$(cat "$TARGET_PID_FILE" 2>/dev/null) - - # 检查读取到的 PID 是否为数字且进程是否存在 if [[ "$TARGET_PID" =~ ^[0-9]+$ ]] && ps -p "$TARGET_PID" > /dev/null 2>&1; then echo "$(date): 目标进程 (PID: $TARGET_PID) 存活,执行相应操作..." - - # 1. 杀掉 tshark_bash 进程 if [[ -f "$TSARK_PID_FILE" ]]; then TSHARK_PID=$(cat "$TSARK_PID_FILE" 2>/dev/null) if [[ "$TSHARK_PID" =~ ^[0-9]+$ ]]; then @@ -42,25 +54,22 @@ while true; do echo "tshark_bash.pid 文件不存在,跳过 kill 操作" fi - # 2. 删除 shark.log - if [[ -f "/hook/shark.log" ]]; then - rm -rf "/hook/shark.log" - echo "已删除 /hook/shark.log" + if [[ -f "$SHARK_LOG_FILE" ]]; then + rm -rf "$SHARK_LOG_FILE" + echo "已删除 $SHARK_LOG_FILE" else - echo "/hook/shark.log 文件不存在,跳过删除" + echo "$SHARK_LOG_FILE 文件不存在,跳过删除" fi - # 3. 删除 /tmp/*.pcapng - if ls /tmp/*.pcapng >/dev/null 2>&1; then - rm -rf /tmp/*.pcapng - echo "已删除 /tmp/*.pcapng 文件" + if ls $PCAPNG_FILES_PATTERN >/dev/null 2>&1; then + rm -rf $PCAPNG_FILES_PATTERN + echo "已删除 $PCAPNG_FILES_PATTERN 文件" else - echo "没有找到 /tmp/*.pcapng 文件,跳过删除" + echo "没有找到 $PCAPNG_FILES_PATTERN 文件,跳过删除" fi - # 4. 启动新的 tshark_bash.sh 并记录 PID - echo "正在启动新的 tshark_bash.sh..." - sudo nohup bash /hook/tshark_bash.sh >> /hook/tshark_bash.log 2>&1 & + echo "重启 tshark_bash.sh..." + sudo nohup bash "$TSARK_SCRIPT_PATH" >> "$TSARK_LOG_FILE" 2>&1 & NEW_TSHARK_PID=$! echo "$NEW_TSHARK_PID" > "$TSARK_PID_FILE" echo "已启动 tshark_bash.sh,新 PID: $NEW_TSHARK_PID,已写入 $TSARK_PID_FILE" @@ -68,7 +77,6 @@ while true; do else echo "$(date): 目标进程 (PID: ${TARGET_PID:-未知}) 不存在或已挂掉,执行清理操作..." - # 1. 杀掉 tshark_bash 进程 if [[ -f "$TSARK_PID_FILE" ]]; then TSHARK_PID=$(cat "$TSARK_PID_FILE" 2>/dev/null) if [[ "$TSHARK_PID" =~ ^[0-9]+$ ]]; then @@ -81,12 +89,11 @@ while true; do echo "tshark_bash.pid 文件不存在,跳过 kill 操作" fi - # 2. 删除 /tmp/*.pcapng - if ls /tmp/*.pcapng >/dev/null 2>&1; then - rm -rf /tmp/*.pcapng - echo "已删除 /tmp/*.pcapng 文件" + if ls $PCAPNG_FILES_PATTERN >/dev/null 2>&1; then + rm -rf $PCAPNG_FILES_PATTERN + echo "已删除 $PCAPNG_FILES_PATTERN 文件" else - echo "没有找到 /tmp/*.pcapng 文件,跳过删除" + echo "没有找到 $PCAPNG_FILES_PATTERN 文件,跳过删除" fi echo "目标进程已挂掉,已执行清理操作。" @@ -95,7 +102,6 @@ while true; do echo "$(date): 目标 PID 文件 $TARGET_PID_FILE 不存在,跳过本次检查。" fi - # 等待 5 分钟 (300 秒) sleep 300 done ``` From 1ed34b669ab030596d277871563b33675df0cc5e Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 17:04:42 +0800 Subject: [PATCH 22/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E5=AE=9A?= =?UTF-8?q?=E6=97=B6=E7=9B=91=E6=8E=A7vLLM=E8=BF=9B=E7=A8=8B.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_定时监控vLLM进程.md | 1 + 1 file changed, 1 insertion(+) diff --git a/202510_定时监控vLLM进程.md b/202510_定时监控vLLM进程.md index ff1a69d..1559827 100644 --- a/202510_定时监控vLLM进程.md +++ b/202510_定时监控vLLM进程.md @@ -104,6 +104,7 @@ while true; do sleep 300 done + ``` ## 定时任务 From e2e248549a26a9849abb77e7c592fe2d995a8022 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 17:04:58 +0800 Subject: [PATCH 23/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E5=AE=9A?= =?UTF-8?q?=E6=97=B6=E7=9B=91=E6=8E=A7vLLM=E8=BF=9B=E7=A8=8B.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_定时监控vLLM进程.md | 1 - 1 file changed, 1 deletion(-) diff --git a/202510_定时监控vLLM进程.md b/202510_定时监控vLLM进程.md index 1559827..ff1a69d 100644 --- a/202510_定时监控vLLM进程.md +++ b/202510_定时监控vLLM进程.md @@ -104,7 +104,6 @@ while true; do sleep 300 done - ``` ## 定时任务 From ad569695685220a522a3e644c8aedcb97a656b7d Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 17:09:55 +0800 Subject: [PATCH 24/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_RTX4090笔电操作记录.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/202510_RTX4090笔电操作记录.md b/202510_RTX4090笔电操作记录.md index 497640f..9b38599 100644 --- a/202510_RTX4090笔电操作记录.md +++ b/202510_RTX4090笔电操作记录.md @@ -62,6 +62,6 @@ sudo nohup bash /home/ss/vllm-py12/tshark_bash.sh >> /home/ss/vllm-py12/tshark_b sudo nohup /home/ss/vllm-py12/timer_bash.sh > /home/ss/vllm-py12/timer_bash.log 2>&1 & echo $! > /home/ss/vllm-py12/timer_bash.pid # 杀死上面2个进程的命令 -sudo kill -9 $(cat /home/ss/vllm-py12/timer_bash.log) -sudo kill -9 $(cat /home/ss/vllm-py12/tshark_bash.log) +sudo kill -9 $(cat /home/ss/vllm-py12/timer_bash.pid) +sudo kill -9 $(cat /home/ss/vllm-py12/tshark_bash.pid) ``` \ No newline at end of file From c39e21f56df5e896cab0a7784c63e3dd286b494c Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 17:11:34 +0800 Subject: [PATCH 25/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5FRTX4090?= =?UTF-8?q?=E7=AC=94=E7=94=B5=E6=93=8D=E4=BD=9C=E8=AE=B0=E5=BD=95.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_RTX4090笔电操作记录.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/202510_RTX4090笔电操作记录.md b/202510_RTX4090笔电操作记录.md index 9b38599..a10d037 100644 --- a/202510_RTX4090笔电操作记录.md +++ b/202510_RTX4090笔电操作记录.md @@ -64,4 +64,7 @@ sudo nohup /home/ss/vllm-py12/timer_bash.sh > /home/ss/vllm-py12/timer_bash.log # 杀死上面2个进程的命令 sudo kill -9 $(cat /home/ss/vllm-py12/timer_bash.pid) sudo kill -9 $(cat /home/ss/vllm-py12/tshark_bash.pid) + +# 清理日志 +cd /home/ss/vllm-py12 && rm -rf timer_bash.log tshark_bash.log shark.log ``` \ No newline at end of file From 22446381db07846372362006e6254250f4cd9bc9 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 17:21:18 +0800 Subject: [PATCH 26/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E5=AE=9A?= =?UTF-8?q?=E6=97=B6=E7=9B=91=E6=8E=A7vLLM=E8=BF=9B=E7=A8=8B.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_定时监控vLLM进程.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/202510_定时监控vLLM进程.md b/202510_定时监控vLLM进程.md index ff1a69d..ced7462 100644 --- a/202510_定时监控vLLM进程.md +++ b/202510_定时监控vLLM进程.md @@ -1,3 +1,5 @@ +请勿关闭此电脑 + ## 修改vLLM运行gpt-oss-120b的脚本 在 nohup 的启动命令末尾追加如下片段,保存 vLLM 的进程 PID 号。 From f676ceb95161e11b0ea9f2a75c9574a3842da193 Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 17:27:35 +0800 Subject: [PATCH 27/27] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20202510=5F=E5=AE=9A?= =?UTF-8?q?=E6=97=B6=E7=9B=91=E6=8E=A7vLLM=E8=BF=9B=E7=A8=8B.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 202510_定时监控vLLM进程.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/202510_定时监控vLLM进程.md b/202510_定时监控vLLM进程.md index ced7462..ff1a69d 100644 --- a/202510_定时监控vLLM进程.md +++ b/202510_定时监控vLLM进程.md @@ -1,5 +1,3 @@ -请勿关闭此电脑 - ## 修改vLLM运行gpt-oss-120b的脚本 在 nohup 的启动命令末尾追加如下片段,保存 vLLM 的进程 PID 号。