From 8b31619713ef8050e3f36310cd93114e4b2da536 Mon Sep 17 00:00:00 2001
From: 8ga
Date: Mon, 27 Oct 2025 19:50:31 +0800
Subject: [PATCH] Delete 202510_排查vLLM崩溃问题.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 202510_排查vLLM崩溃问题.md | 113 -------------------------------------
 1 file changed, 113 deletions(-)
 delete mode 100644 202510_排查vLLM崩溃问题.md

diff --git a/202510_排查vLLM崩溃问题.md b/202510_排查vLLM崩溃问题.md
deleted file mode 100644
index 4b5f86a..0000000
--- a/202510_排查vLLM崩溃问题.md
+++ /dev/null
@@ -1,113 +0,0 @@
-## Root Cause
-
-An external caller of vLLM's OpenAI-compatible API service sent request parameters that made vLLM invoke the **get_structured_output_key** function. That function could not handle the parameters and raised a **No valid structured output parameter found** ValueError, which brought down both the vLLM EngineCore and APIServer processes.
-
-## Source
-
-https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/v1/structured_output/request.py
-
-## Investigation
-
-vLLM does not log the parameters of incoming HTTP requests, and no third-party HTTP-monitoring tool was integrated, so the tshark packet-capture tool was installed on the Ubuntu host. A Java script starts the tshark command and writes the captured traffic to a file on disk. The next time vLLM crashes, the requests recorded in that file can be analyzed to find out which parameters triggered the exception in **get_structured_output_key**.
-
-> tshark is driven from Java to hide the platform differences between Linux and Windows, so the same script runs on both without changing any code or commands.
-
-> Java script:
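-
-The original script was not preserved in this note, so the sketch below is a minimal reconstruction of the approach described above: launch tshark as a child process, filter for HTTP requests to the vLLM port, and append every captured request to a disk file. The class name, capture interface (`any`), port (8000), selected fields, and output path are all illustrative assumptions, not the original values.
-
-```java
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-
-public class TsharkCapture {
-    public static void main(String[] args) throws IOException, InterruptedException {
-        // Capture HTTP requests to the vLLM OpenAI API port (8000 assumed) and
-        // emit the request URI plus body so the JSON parameters are visible.
-        // "-i any" is Linux-only; on Windows, pass an interface from "tshark -D".
-        ProcessBuilder pb = new ProcessBuilder(
-                "tshark", "-i", "any", "-l",
-                "-f", "tcp port 8000",
-                "-Y", "http.request",
-                "-T", "fields",
-                "-e", "http.request.uri",
-                "-e", "http.file_data");
-        pb.redirectErrorStream(true); // merge stderr so tshark's own errors are kept too
-        Process process = pb.start();
-
-        Path logFile = Path.of("vllm-requests.log"); // assumed output location
-        try (BufferedReader reader = new BufferedReader(
-                new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
-            String line;
-            while ((line = reader.readLine()) != null) {
-                // Append each captured request to disk immediately, so the
-                // evidence survives even if vLLM (or this process) dies.
-                Files.writeString(logFile, line + System.lineSeparator(),
-                        StandardOpenOption.CREATE, StandardOpenOption.APPEND);
-            }
-        }
-        process.waitFor();
-    }
-}
-```
-
-Compile and run with `javac TsharkCapture.java && java TsharkCapture`; tshark must be on the PATH and the user needs packet-capture privileges.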
-
-## Crash Log
-
-```text
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] EngineCore encountered a fatal error.
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] Traceback (most recent call last):
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 701, in run_engine_core
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     engine_core.run_busy_loop()
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 728, in run_busy_loop
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     self._process_engine_step()
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 754, in _process_engine_step
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     outputs, model_executed = self.step_fn()
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 283, in step
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     scheduler_output = self.scheduler.schedule()
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/core/sched/scheduler.py", line 359, in schedule
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     if structured_output_req and structured_output_req.grammar:
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 45, in grammar
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     completed = self._check_grammar_completion()
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 33, in _check_grammar_completion
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     self._grammar = self._grammar.result(timeout=0.0001)
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 458, in result
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     return self.__get_result()
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     raise self._exception
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/thread.py", line 58, in run
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     result = self.fn(*self.args, **self.kwargs)
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/__init__.py", line 128, in _async_create_grammar
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     key = request.structured_output_request.structured_output_key # type: ignore[union-attr]
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/functools.py", line 981, in __get__
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     val = self.func(instance)
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 58, in structured_output_key
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     return get_structured_output_key(self.sampling_params)
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 86, in get_structured_output_key
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710]     raise ValueError("No valid structured output parameter found")
-(EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] ValueError: No valid structured output parameter found
-(EngineCore_DP0 pid=2738693) Process EngineCore_DP0:
-(EngineCore_DP0 pid=2738693) Traceback (most recent call last):
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] AsyncLLM output_handler failed.
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] Traceback (most recent call last):
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 439, in output_handler
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480]     outputs = await engine_core.get_output_async()
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 846, in get_output_async
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480]     raise self._format_exception(outputs) from None
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [async_llm.py:480] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
-(EngineCore_DP0 pid=2738693)     self.run()
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/multiprocessing/process.py", line 108, in run
-(EngineCore_DP0 pid=2738693)     self._target(*self._args, **self._kwargs)
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
-(EngineCore_DP0 pid=2738693)     raise e
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 701, in run_engine_core
-(EngineCore_DP0 pid=2738693)     engine_core.run_busy_loop()
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 728, in run_busy_loop
-(EngineCore_DP0 pid=2738693)     self._process_engine_step()
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 754, in _process_engine_step
-(EngineCore_DP0 pid=2738693)     outputs, model_executed = self.step_fn()
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 283, in step
-(EngineCore_DP0 pid=2738693)     scheduler_output = self.scheduler.schedule()
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/core/sched/scheduler.py", line 359, in schedule
-(EngineCore_DP0 pid=2738693)     if structured_output_req and structured_output_req.grammar:
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 45, in grammar
-(EngineCore_DP0 pid=2738693)     completed = self._check_grammar_completion()
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 33, in _check_grammar_completion
-(EngineCore_DP0 pid=2738693)     self._grammar = self._grammar.result(timeout=0.0001)
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 458, in result
-(EngineCore_DP0 pid=2738693)     return self.__get_result()
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
-(EngineCore_DP0 pid=2738693)     raise self._exception
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/concurrent/futures/thread.py", line 58, in run
-(EngineCore_DP0 pid=2738693)     result = self.fn(*self.args, **self.kwargs)
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/__init__.py", line 128, in _async_create_grammar
-(EngineCore_DP0 pid=2738693)     key = request.structured_output_request.structured_output_key # type: ignore[union-attr]
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/functools.py", line 981, in __get__
-(EngineCore_DP0 pid=2738693)     val = self.func(instance)
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 58, in structured_output_key
-(EngineCore_DP0 pid=2738693)     return get_structured_output_key(self.sampling_params)
-(EngineCore_DP0 pid=2738693)   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 86, in get_structured_output_key
-(EngineCore_DP0 pid=2738693)     raise ValueError("No valid structured output parameter found")
-(EngineCore_DP0 pid=2738693) ValueError: No valid structured output parameter found
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] Error in chat completion stream generator.
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] Traceback (most recent call last):
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/entrypoints/openai/serving_chat.py", line 574, in chat_completion_stream_generator
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]     async for res in result_generator:
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 387, in generate
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]     out = q.get_nowait() or await q.get()
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/output_processor.py", line 59, in get
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]     raise output
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 439, in output_handler
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]     outputs = await engine_core.get_output_async()
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]   File "/aisoft/conda/env/vllm2/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 846, in get_output_async
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145]     raise self._format_exception(outputs) from None
-(APIServer pid=2738423) ERROR 10-10 10:43:10 [serving_chat.py:1145] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
-[rank0]:[W1010 10:43:10.666166970 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
-(APIServer pid=2738423) INFO: Shutting down
-(APIServer pid=2738423) INFO: Waiting for application shutdown.
-(APIServer pid=2738423) INFO: Application shutdown complete.
-(APIServer pid=2738423) INFO: Finished server process [2738423]
-```
\ No newline at end of file