From c53844f93e03c4b59ec95578b292b5e4953b9f75 Mon Sep 17 00:00:00 2001
From: 8ga
Date: Sat, 11 Oct 2025 13:52:50 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2020251011.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 20251011.md | 605 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 565 insertions(+), 40 deletions(-)

diff --git a/20251011.md b/20251011.md
index 99e752d..7c64943 100644
--- a/20251011.md
+++ b/20251011.md
@@ -1,52 +1,577 @@
-## Crash cause
+## Source code trace
 
-- A request carrying structured-output intent was sent to the vLLM service.
-- While handling it, vLLM entered the structured_output_request.structured_output_key logic and eventually called get_structured_output_key(sampling_params).
-- That function walks the candidate parameters on sampling_params (grammar, json_schema, regex, etc.) but found no valid structured-output parameter.
-- It therefore raised ValueError("No valid structured output parameter found").
-- The ValueError was raised in a background thread and never caught, crashing the EngineCore process (PID 2738693).
-- The main API service (APIServer) detected the EngineCore crash, raised EngineDeadError, and the whole service terminated.
+### get_structured_output_key implementation
 
-## Solution 1
+#### Branch: release/v0.11.0
 
-Make sure every request that reaches vLLM provides a valid structured-output parameter, for example:
-
-```json
-{
-    "prompt": "Generate a user profile",
-    "structured_output": {
-        "type": "json",
-        "schema": {
-            "type": "object",
-            "properties": {
-                "name": {"type": "string"},
-                "age": {"type": "integer"}
-            },
-            "required": ["name", "age"]
-        }
-    }
-}
+```python
+# https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/v1/structured_output/request.py
+def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey:
+    params = sampling_params.structured_outputs
+    assert params is not None, "params can't be None."
+    # Crash path: structured_outputs satisfies none of the conditions below
+    if params.json is not None:
+        if not isinstance(params.json, str):
+            json_str = json.dumps(params.json)
+        else:
+            json_str = params.json
+        return (StructuredOutputOptions.JSON, json_str)
+    elif params.json_object:
+        return (StructuredOutputOptions.JSON_OBJECT, "")
+    elif params.regex is not None:
+        return (StructuredOutputOptions.REGEX, params.regex)
+    elif params.choice is not None:
+        if not isinstance(params.choice, str):
+            json_str = json.dumps(params.choice)
+        else:
+            json_str = params.choice
+        return (StructuredOutputOptions.CHOICE, json_str)
+    elif params.grammar is not None:
+        return (StructuredOutputOptions.GRAMMAR, params.grammar)
+    elif params.structural_tag is not None:
+        return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag)
+    else:
+        # This is the error that was ultimately raised
+        raise ValueError("No valid structured output parameter found")
 ```
 
-##### Invalid example
+#### Branch: release/v0.10.2
 
-```json
-{
-    "prompt": "Generate a JSON object",
-    "structured_output": {}
-}
+```python
+# https://github.com/vllm-project/vllm/blob/releases/v0.10.2/vllm/v1/structured_output/request.py
+def get_structured_output_key(
+        sampling_params: SamplingParams) -> StructuredOutputKey:
+    params = sampling_params.guided_decoding
+    assert params is not None, "params can't be None."
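+    # --- Editor's annotation (not part of the upstream file) ----------------
+    # v0.10.2 still reads the settings from `guided_decoding`; v0.11.0 above
+    # reads the renamed `structured_outputs` field instead. In both versions
+    # the branch chain below decides the crash. A hypothetical repro sketch,
+    # assuming a GuidedDecodingParams object with every field left unset:
+    #
+    #     sp = SamplingParams(guided_decoding=GuidedDecodingParams())
+    #     get_structured_output_key(sp)
+    #
+    # matches none of the branches and falls through to the final else, which
+    # raises ValueError("No valid structured output parameter found").
+    # -------------------------------------------------------------------------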
+ if params.json is not None: + if not isinstance(params.json, str): + json_str = json.dumps(params.json) + else: + json_str = params.json + return (StructuredOutputOptions.JSON, json_str) + elif params.json_object: + return (StructuredOutputOptions.JSON_OBJECT, "") + elif params.regex is not None: + return (StructuredOutputOptions.REGEX, params.regex) + elif params.choice is not None: + if not isinstance(params.choice, str): + json_str = json.dumps(params.choice) + else: + json_str = params.choice + return (StructuredOutputOptions.CHOICE, json_str) + elif params.grammar is not None: + return (StructuredOutputOptions.GRAMMAR, params.grammar) + elif params.structural_tag is not None: + return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) + else: + raise ValueError("No valid structured output parameter found") ``` -```json -{ - "prompt": "生成一个JSON", - "json_schema": null -} +### SamplingParams 源码 + +#### 分支:release/v0.11.0 + +```python + +```python +# https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/sampling_params.py +class SamplingParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + # required for @cached_property. + dict=True): # type: ignore[call-arg] + """Sampling parameters for text generation. + + Overall, we follow the sampling parameters from the OpenAI text completion + API (https://platform.openai.com/docs/api-reference/completions/create). + In addition, we support beam search, which is not supported by OpenAI. + """ + + n: int = 1 + """Number of outputs to return for the given prompt request. + + NOTE: + `AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs + are generated and streamed cumulatively per request. To see all `n` + outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY` + in `SamplingParams`.""" + best_of: Optional[int] = None + """Number of output sequences that are generated from the prompt. From + these `best_of` sequences, the top `n` sequences are returned. `best_of` + must be greater than or equal to `n`. By default, `best_of` is set to `n`. + Warning, this is only supported in V0.""" + _real_n: Optional[int] = None + presence_penalty: float = 0.0 + """Penalizes new tokens based on whether they appear in the generated text + so far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" + frequency_penalty: float = 0.0 + """Penalizes new tokens based on their frequency in the generated text so + far. Values > 0 encourage the model to use new tokens, while values < 0 + encourage the model to repeat tokens.""" + repetition_penalty: float = 1.0 + """Penalizes new tokens based on whether they appear in the prompt and the + generated text so far. Values > 1 encourage the model to use new tokens, + while values < 1 encourage the model to repeat tokens.""" + temperature: float = 1.0 + """Controls the randomness of the sampling. Lower values make the model + more deterministic, while higher values make the model more random. Zero + means greedy sampling.""" + top_p: float = 1.0 + """Controls the cumulative probability of the top tokens to consider. Must + be in (0, 1]. Set to 1 to consider all tokens.""" + top_k: int = 0 + """Controls the number of top tokens to consider. Set to 0 (or -1) to + consider all tokens.""" + min_p: float = 0.0 + """Represents the minimum probability for a token to be considered, + relative to the probability of the most likely token. Must be in [0, 1]. 
+ Set to 0 to disable this.""" + seed: Optional[int] = None + """Random seed to use for the generation.""" + stop: Optional[Union[str, list[str]]] = None + """String(s) that stop the generation when they are generated. The returned + output will not contain the stop strings.""" + stop_token_ids: Optional[list[int]] = None + """Token IDs that stop the generation when they are generated. The returned + output will contain the stop tokens unless the stop tokens are special + tokens.""" + ignore_eos: bool = False + """Whether to ignore the EOS token and continue generating + tokens after the EOS token is generated.""" + max_tokens: Optional[int] = 16 + """Maximum number of tokens to generate per output sequence.""" + min_tokens: int = 0 + """Minimum number of tokens to generate per output sequence before EOS or + `stop_token_ids` can be generated""" + logprobs: Optional[int] = None + """Number of log probabilities to return per output token. When set to + `None`, no probability is returned. If set to a non-`None` value, the + result includes the log probabilities of the specified number of most + likely tokens, as well as the chosen tokens. Note that the implementation + follows the OpenAI API: The API will always return the log probability of + the sampled token, so there may be up to `logprobs+1` elements in the + response. When set to -1, return all `vocab_size` log probabilities.""" + prompt_logprobs: Optional[int] = None + """Number of log probabilities to return per prompt token. + When set to -1, return all `vocab_size` log probabilities.""" + # NOTE: This parameter is only exposed at the engine level for now. + # It is not exposed in the OpenAI API server, as the OpenAI API does + # not support returning only a list of token IDs. + detokenize: bool = True + """Whether to detokenize the output.""" + skip_special_tokens: bool = True + """Whether to skip special tokens in the output.""" + spaces_between_special_tokens: bool = True + """Whether to add spaces between special tokens in the output.""" + # Optional[list[LogitsProcessor]] type. We use Any here because + # Optional[list[LogitsProcessor]] type is not supported by msgspec. + logits_processors: Optional[Any] = None + """Functions that modify logits based on previously generated tokens, and + optionally prompt tokens as a first argument.""" + include_stop_str_in_output: bool = False + """Whether to include the stop strings in output text.""" + truncate_prompt_tokens: Optional[Annotated[int, + msgspec.Meta(ge=-1)]] = None + """If set to -1, will use the truncation size supported by the model. If + set to an integer k, will use only the last k tokens from the prompt + (i.e., left truncation). If set to `None`, truncation is disabled.""" + output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE + + # The below fields are not supposed to be used as an input. + # They are set in post_init. 
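+    # --- Editor's annotation (not part of the upstream file) ----------------
+    # The two fields most relevant to this trace are declared a few lines
+    # below: `structured_outputs` (the new name) and `guided_decoding` (a
+    # deprecated alias that __post_init__ remaps onto `structured_outputs`).
+    # A request only avoids the ValueError traced above when one of them
+    # carries a usable constraint, e.g. (hypothetical sketch, not a verified
+    # call):
+    #
+    #     SamplingParams(
+    #         structured_outputs=StructuredOutputsParams(
+    #             json={"type": "object",
+    #                   "properties": {"name": {"type": "string"}},
+    #                   "required": ["name"]}))
+    #
+    # -------------------------------------------------------------------------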
+ output_text_buffer_length: int = 0 + _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) + + # Fields used to construct logits processors + structured_outputs: Optional[StructuredOutputsParams] = None + """Parameters for configuring structured outputs.""" + guided_decoding: Optional[GuidedDecodingParams] = None + """Deprecated alias for structured_outputs.""" + logit_bias: Optional[dict[int, float]] = None + """If provided, the engine will construct a logits processor that applies + these logit biases.""" + allowed_token_ids: Optional[list[int]] = None + """If provided, the engine will construct a logits processor which only + retains scores for the given token ids.""" + extra_args: Optional[dict[str, Any]] = None + """Arbitrary additional args, that can be used by custom sampling + implementations, plugins, etc. Not used by any in-tree sampling + implementations.""" + + # Fields used for bad words + bad_words: Optional[list[str]] = None + """Words that are not allowed to be generated. More precisely, only the + last token of a corresponding token sequence is not allowed when the next + generated token can complete the sequence.""" + _bad_words_token_ids: Optional[list[list[int]]] = None + + @staticmethod + def from_optional( + n: Optional[int] = 1, + best_of: Optional[int] = None, + presence_penalty: Optional[float] = 0.0, + frequency_penalty: Optional[float] = 0.0, + repetition_penalty: Optional[float] = 1.0, + temperature: Optional[float] = 1.0, + top_p: Optional[float] = 1.0, + top_k: int = 0, + min_p: float = 0.0, + seed: Optional[int] = None, + stop: Optional[Union[str, list[str]]] = None, + stop_token_ids: Optional[list[int]] = None, + bad_words: Optional[list[str]] = None, + include_stop_str_in_output: bool = False, + ignore_eos: bool = False, + max_tokens: Optional[int] = 16, + min_tokens: int = 0, + logprobs: Optional[int] = None, + prompt_logprobs: Optional[int] = None, + detokenize: bool = True, + skip_special_tokens: bool = True, + spaces_between_special_tokens: bool = True, + logits_processors: Optional[list[LogitsProcessor]] = None, + truncate_prompt_tokens: Optional[Annotated[int, + msgspec.Meta( + ge=-1)]] = None, + output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, + structured_outputs: Optional[StructuredOutputsParams] = None, + guided_decoding: Optional[GuidedDecodingParams] = None, + logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, + allowed_token_ids: Optional[list[int]] = None, + extra_args: Optional[dict[str, Any]] = None, + ) -> "SamplingParams": + if logit_bias is not None: + # Convert token_id to integer + # Clamp the bias between -100 and 100 per OpenAI API spec + logit_bias = { + int(token): min(100.0, max(-100.0, bias)) + for token, bias in logit_bias.items() + } + if guided_decoding is not None: + warnings.warn( + "guided_decoding is deprecated. This will be removed in " + "v0.12.0 or v1.0.0, which ever is soonest. 
Please use " + "structured_outputs instead.", + DeprecationWarning, + stacklevel=2) + structured_outputs = guided_decoding + guided_decoding = None + + return SamplingParams( + n=1 if n is None else n, + best_of=best_of, + presence_penalty=0.0 + if presence_penalty is None else presence_penalty, + frequency_penalty=0.0 + if frequency_penalty is None else frequency_penalty, + repetition_penalty=1.0 + if repetition_penalty is None else repetition_penalty, + temperature=1.0 if temperature is None else temperature, + top_p=1.0 if top_p is None else top_p, + top_k=top_k, + min_p=min_p, + seed=seed, + stop=stop, + stop_token_ids=stop_token_ids, + bad_words=bad_words, + include_stop_str_in_output=include_stop_str_in_output, + ignore_eos=ignore_eos, + max_tokens=max_tokens, + min_tokens=min_tokens, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + detokenize=detokenize, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + logits_processors=logits_processors, + truncate_prompt_tokens=truncate_prompt_tokens, + output_kind=output_kind, + structured_outputs=structured_outputs, + logit_bias=logit_bias, + allowed_token_ids=allowed_token_ids, + extra_args=extra_args, + ) + + def __post_init__(self) -> None: + # how we deal with `best_of``: + # if `best_of`` is not set, we default to `n`; + # if `best_of`` is set, we set `n`` to `best_of`, + # and set `_real_n`` to the original `n`. + # when we return the result, we will check + # if we need to return `n` or `_real_n` results + if self.best_of: + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") + if not self._real_n: + self._real_n = self.n + self.n = self.best_of + + if 0 < self.temperature < _MAX_TEMP: + logger.warning( + "temperature %s is less than %s, which may cause numerical " + "errors nan or inf in tensors. We have maxed it out to %s.", + self.temperature, _MAX_TEMP, _MAX_TEMP) + self.temperature = max(self.temperature, _MAX_TEMP) + + if self.seed == -1: + self.seed = None + + if self.stop is None: + self.stop = [] + elif isinstance(self.stop, str): + self.stop = [self.stop] + + if self.stop_token_ids is None: + self.stop_token_ids = [] + + if self.bad_words is None: + self.bad_words = [] + + if self.logprobs is True: + self.logprobs = 1 + + if self.prompt_logprobs is True: + self.prompt_logprobs = 1 + + # Number of characters to hold back for stop string evaluation + # until sequence is finished. + if self.stop and not self.include_stop_str_in_output: + self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 + + self._verify_args() + + if self.temperature < _SAMPLING_EPS: + # Zero temperature means greedy sampling. + self.top_p = 1.0 + self.top_k = 0 + self.min_p = 0.0 + self._verify_greedy_sampling() + + # eos_token_id is added to this by the engine + self._all_stop_token_ids.update(self.stop_token_ids) + + if self.guided_decoding is not None: + warnings.warn( + "guided_decoding is deprecated. This will be removed in " + "v0.12.0 or v1.0.0, which ever is soonest. 
Please use " + "structured_outputs instead.", + DeprecationWarning, + stacklevel=2) + self.structured_outputs = self.guided_decoding + self.guided_decoding = None + + def _verify_args(self) -> None: + if not isinstance(self.n, int): + raise ValueError(f"n must be an int, but is of " + f"type {type(self.n)}") + if self.n < 1: + raise ValueError(f"n must be at least 1, got {self.n}.") + if self.best_of is not None: + if not isinstance(self.best_of, int): + raise ValueError( + f"best_of must be an integer, got {type(self.best_of)}") + if self.best_of < 1: + raise ValueError( + f"best_of must be at least 1, got {self.best_of}") + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") + if not -2.0 <= self.presence_penalty <= 2.0: + raise ValueError("presence_penalty must be in [-2, 2], got " + f"{self.presence_penalty}.") + if not -2.0 <= self.frequency_penalty <= 2.0: + raise ValueError("frequency_penalty must be in [-2, 2], got " + f"{self.frequency_penalty}.") + if self.repetition_penalty <= 0.0: + raise ValueError( + "repetition_penalty must be greater than zero, got " + f"{self.repetition_penalty}.") + if self.temperature < 0.0: + raise ValueError( + f"temperature must be non-negative, got {self.temperature}.") + if not 0.0 < self.top_p <= 1.0: + raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.") + # quietly accept -1 as disabled, but prefer 0 + if self.top_k < -1: + raise ValueError(f"top_k must be 0 (disable), or at least 1, " + f"got {self.top_k}.") + if not isinstance(self.top_k, int): + raise TypeError( + f"top_k must be an integer, got {type(self.top_k).__name__}") + if not 0.0 <= self.min_p <= 1.0: + raise ValueError("min_p must be in [0, 1], got " + f"{self.min_p}.") + if self.max_tokens is not None and self.max_tokens < 1: + raise ValueError( + f"max_tokens must be at least 1, got {self.max_tokens}.") + if self.min_tokens < 0: + raise ValueError(f"min_tokens must be greater than or equal to 0, " + f"got {self.min_tokens}.") + if self.max_tokens is not None and self.min_tokens > self.max_tokens: + raise ValueError( + f"min_tokens must be less than or equal to " + f"max_tokens={self.max_tokens}, got {self.min_tokens}.") + if (self.logprobs is not None and self.logprobs != -1 + and self.logprobs < 0): + raise ValueError( + f"logprobs must be non-negative or -1, got {self.logprobs}.") + if (self.prompt_logprobs is not None and self.prompt_logprobs != -1 + and self.prompt_logprobs < 0): + raise ValueError( + f"prompt_logprobs must be non-negative or -1, got " + f"{self.prompt_logprobs}.") + if (self.truncate_prompt_tokens is not None + and (self.truncate_prompt_tokens == 0 + or self.truncate_prompt_tokens < -1)): + raise ValueError( + f"truncate_prompt_tokens must be an integer >= 1 or -1, " + f"got {self.truncate_prompt_tokens}") + assert isinstance(self.stop_token_ids, list) + if not all(isinstance(st_id, int) for st_id in self.stop_token_ids): + raise ValueError(f"stop_token_ids must contain only integers, " + f"got {self.stop_token_ids}.") + assert isinstance(self.stop, list) + if any(not stop_str for stop_str in self.stop): + raise ValueError("stop cannot contain an empty string.") + if self.stop and not self.detokenize: + raise ValueError( + "stop strings are only supported when detokenize is True. 
" + "Set detokenize=True to use stop.") + if self.best_of != self._real_n and self.output_kind == ( + RequestOutputKind.DELTA): + raise ValueError("best_of must equal n to use output_kind=DELTA") + + def _verify_greedy_sampling(self) -> None: + if self.n > 1: + raise ValueError("n must be 1 when using greedy sampling, " + f"got {self.n}.") + + def update_from_generation_config( + self, + generation_config: dict[str, Any], + model_eos_token_id: Optional[int] = None) -> None: + """Update if there are non-default values from generation_config""" + + if model_eos_token_id is not None: + # Add the eos token id into the sampling_params to support + # min_tokens processing. + self._all_stop_token_ids.add(model_eos_token_id) + + # Update eos_token_id for generation + if (eos_ids := generation_config.get("eos_token_id")) is not None: + # it can be either int or list of int + eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids) + if model_eos_token_id is not None: + # We don't need to include the primary eos_token_id in + # stop_token_ids since it's handled separately for stopping + # purposes. + eos_ids.discard(model_eos_token_id) + if eos_ids: + self._all_stop_token_ids.update(eos_ids) + if not self.ignore_eos: + eos_ids.update(self.stop_token_ids) + self.stop_token_ids = list(eos_ids) + + def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: + if not self.bad_words: + return + self._bad_words_token_ids = [] + for bad_word in self.bad_words: + # To prohibit words both at the beginning + # and in the middle of text + # (related to add_prefix_space tokenizer parameter) + for add_prefix_space in [False, True]: + prefix = " " if add_prefix_space else "" + prompt = prefix + bad_word.lstrip() + prompt_token_ids = tokenizer.encode(text=prompt, + add_special_tokens=False) + + # If no space at the beginning + # or if prefix space produces a new word token + if (not add_prefix_space) or ( + add_prefix_space and prompt_token_ids[0] + != self._bad_words_token_ids[-1][0] + and len(prompt_token_ids) == len( + self._bad_words_token_ids[-1])): + self._bad_words_token_ids.append(prompt_token_ids) + + invalid_token_ids = [ + token_id for bad_words_token_ids in self._bad_words_token_ids + for token_id in bad_words_token_ids + if token_id < 0 or token_id > tokenizer.max_token_id + ] + if len(invalid_token_ids) > 0: + raise ValueError( + f"The model vocabulary size is {tokenizer.max_token_id+1}," + f" but the following tokens" + f" were specified as bad: {invalid_token_ids}." + f" All token id values should be integers satisfying:" + f" 0 <= token_id <= {tokenizer.max_token_id}.") + + @cached_property + def sampling_type(self) -> SamplingType: + if self.temperature < _SAMPLING_EPS: + return SamplingType.GREEDY + if self.seed is not None: + return SamplingType.RANDOM_SEED + return SamplingType.RANDOM + + @property + def all_stop_token_ids(self) -> set[int]: + return self._all_stop_token_ids + + @property + def bad_words_token_ids(self) -> Optional[list[list[int]]]: + # For internal use only. Backward compatibility not guaranteed + return self._bad_words_token_ids + + def clone(self) -> "SamplingParams": + """Deep copy, but maybe not the LogitsProcessor objects. + + LogitsProcessor objects may contain an arbitrary, nontrivial amount of + data that is expensive to copy. 
However, if not copied, the processor + needs to support parallel decoding for multiple sequences + See https://github.com/vllm-project/vllm/issues/3087 + """ + + logit_processor_refs = None if self.logits_processors is None else { + id(lp): lp.clone() if hasattr(lp, 'clone') else lp + for lp in self.logits_processors + } + return copy.deepcopy(self, memo=logit_processor_refs) + + def __repr__(self) -> str: + return ( + f"SamplingParams(n={self.n}, " + f"presence_penalty={self.presence_penalty}, " + f"frequency_penalty={self.frequency_penalty}, " + f"repetition_penalty={self.repetition_penalty}, " + f"temperature={self.temperature}, " + f"top_p={self.top_p}, " + f"top_k={self.top_k}, " + f"min_p={self.min_p}, " + f"seed={self.seed}, " + f"stop={self.stop}, " + f"stop_token_ids={self.stop_token_ids}, " + f"bad_words={self.bad_words}, " + f"include_stop_str_in_output={self.include_stop_str_in_output}, " + f"ignore_eos={self.ignore_eos}, " + f"max_tokens={self.max_tokens}, " + f"min_tokens={self.min_tokens}, " + f"logprobs={self.logprobs}, " + f"prompt_logprobs={self.prompt_logprobs}, " + f"skip_special_tokens={self.skip_special_tokens}, " + "spaces_between_special_tokens=" + f"{self.spaces_between_special_tokens}, " + f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " + f"structured_outputs={self.structured_outputs}, " + f"extra_args={self.extra_args})") +``` ``` - -## 解决方案二 - -升级vllm的版本到最新稳定版,官方已做出大量优化和改进。 ## 崩溃日志片段