From 253c59fcc442167679e4a0d8a15a19fe619ecfe8 Mon Sep 17 00:00:00 2001
From: liushuang
Date: Sat, 11 Oct 2025 14:45:09 +0800
Subject: [PATCH] add

---
 20251011.md | 1092 ++-------------------------------------------------
 1 file changed, 29 insertions(+), 1063 deletions(-)

diff --git a/20251011.md b/20251011.md
index 03d52c4..bf0de33 100644
--- a/20251011.md
+++ b/20251011.md
@@ -1,4 +1,8 @@
-## Source-code trace
+## Root cause
+
+While processing the structured_outputs parameter, get_structured_output_key raises an exception. The exception is not caught by EngineCore, so the engine crashes; the APIServer then detects that the engine has died and exits its own process as well.
+
+## The relevant source code
 
 ### Implementation of get_structured_output_key
 
@@ -10,7 +14,6 @@ https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/v1/structured_ou
 
 ```python
 def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey:
     params = sampling_params.structured_outputs
     assert params is not None, "params can't be None."
-    # the attribute read here is structured_outputs
     if params.json is not None:
         if not isinstance(params.json, str):
             json_str = json.dumps(params.json)
@@ -32,7 +35,6 @@ def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutp
     elif params.structural_tag is not None:
         return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag)
     else:
-        # none of the branches above matched, so this error is raised
        raise ValueError("No valid structured output parameter found")
 ```
 
@@ -42,8 +44,7 @@ https://github.com/vllm-project/vllm/blob/releases/v0.10.2/vllm/v1/structured_ou
 
 ```python
 def get_structured_output_key(
-        sampling_params: SamplingParams) -> StructuredOutputKey:
-    # the attribute read here is guided_decoding
+        sampling_params: SamplingParams) -> StructuredOutputKey:
     params = sampling_params.guided_decoding
     assert params is not None, "params can't be None."
     if params.json is not None:
@@ -69,1094 +70,59 @@ def get_structured_output_key(
     else:
         raise ValueError("No valid structured output parameter found")
 ```
+> The only difference between the two versions is which attribute of `SamplingParams` they read; the branching logic is otherwise identical.
 
-### SamplingParams source
-
-#### Branch: release/v0.11.0
+### SamplingParams
 
 https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/sampling_params.py
 
 ```python
+# other fields omitted
 class SamplingParams(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        # required for @cached_property.
-        dict=True):  # type: ignore[call-arg]
-    """Sampling parameters for text generation.
-
-    Overall, we follow the sampling parameters from the OpenAI text completion
-    API (https://platform.openai.com/docs/api-reference/completions/create).
-    In addition, we support beam search, which is not supported by OpenAI.
-    """
-
-    n: int = 1
-    """Number of outputs to return for the given prompt request.
-
-    NOTE:
-        `AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs
-        are generated and streamed cumulatively per request. To see all `n`
-        outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
-        in `SamplingParams`."""
-    best_of: Optional[int] = None
-    """Number of output sequences that are generated from the prompt. From
-    these `best_of` sequences, the top `n` sequences are returned. `best_of`
-    must be greater than or equal to `n`. By default, `best_of` is set to `n`.
-    Warning, this is only supported in V0."""
-    _real_n: Optional[int] = None
-    presence_penalty: float = 0.0
-    """Penalizes new tokens based on whether they appear in the generated text
-    so far. Values > 0 encourage the model to use new tokens, while values < 0
-    encourage the model to repeat tokens."""
-    frequency_penalty: float = 0.0
-    """Penalizes new tokens based on their frequency in the generated text so
-    far. 
Values > 0 encourage the model to use new tokens, while values < 0 - encourage the model to repeat tokens.""" - repetition_penalty: float = 1.0 - """Penalizes new tokens based on whether they appear in the prompt and the - generated text so far. Values > 1 encourage the model to use new tokens, - while values < 1 encourage the model to repeat tokens.""" - temperature: float = 1.0 - """Controls the randomness of the sampling. Lower values make the model - more deterministic, while higher values make the model more random. Zero - means greedy sampling.""" - top_p: float = 1.0 - """Controls the cumulative probability of the top tokens to consider. Must - be in (0, 1]. Set to 1 to consider all tokens.""" - top_k: int = 0 - """Controls the number of top tokens to consider. Set to 0 (or -1) to - consider all tokens.""" - min_p: float = 0.0 - """Represents the minimum probability for a token to be considered, - relative to the probability of the most likely token. Must be in [0, 1]. - Set to 0 to disable this.""" - seed: Optional[int] = None - """Random seed to use for the generation.""" - stop: Optional[Union[str, list[str]]] = None - """String(s) that stop the generation when they are generated. The returned - output will not contain the stop strings.""" - stop_token_ids: Optional[list[int]] = None - """Token IDs that stop the generation when they are generated. The returned - output will contain the stop tokens unless the stop tokens are special - tokens.""" - ignore_eos: bool = False - """Whether to ignore the EOS token and continue generating - tokens after the EOS token is generated.""" - max_tokens: Optional[int] = 16 - """Maximum number of tokens to generate per output sequence.""" - min_tokens: int = 0 - """Minimum number of tokens to generate per output sequence before EOS or - `stop_token_ids` can be generated""" - logprobs: Optional[int] = None - """Number of log probabilities to return per output token. When set to - `None`, no probability is returned. If set to a non-`None` value, the - result includes the log probabilities of the specified number of most - likely tokens, as well as the chosen tokens. Note that the implementation - follows the OpenAI API: The API will always return the log probability of - the sampled token, so there may be up to `logprobs+1` elements in the - response. When set to -1, return all `vocab_size` log probabilities.""" - prompt_logprobs: Optional[int] = None - """Number of log probabilities to return per prompt token. - When set to -1, return all `vocab_size` log probabilities.""" - # NOTE: This parameter is only exposed at the engine level for now. - # It is not exposed in the OpenAI API server, as the OpenAI API does - # not support returning only a list of token IDs. - detokenize: bool = True - """Whether to detokenize the output.""" - skip_special_tokens: bool = True - """Whether to skip special tokens in the output.""" - spaces_between_special_tokens: bool = True - """Whether to add spaces between special tokens in the output.""" - # Optional[list[LogitsProcessor]] type. We use Any here because - # Optional[list[LogitsProcessor]] type is not supported by msgspec. 
- logits_processors: Optional[Any] = None - """Functions that modify logits based on previously generated tokens, and - optionally prompt tokens as a first argument.""" - include_stop_str_in_output: bool = False - """Whether to include the stop strings in output text.""" - truncate_prompt_tokens: Optional[Annotated[int, - msgspec.Meta(ge=-1)]] = None - """If set to -1, will use the truncation size supported by the model. If - set to an integer k, will use only the last k tokens from the prompt - (i.e., left truncation). If set to `None`, truncation is disabled.""" - output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE - - # The below fields are not supposed to be used as an input. - # They are set in post_init. - output_text_buffer_length: int = 0 - _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) - - # Fields used to construct logits processors + 💡初始化逻辑是没问题的,默认值是None structured_outputs: Optional[StructuredOutputsParams] = None - """Parameters for configuring structured outputs.""" guided_decoding: Optional[GuidedDecodingParams] = None - """Deprecated alias for structured_outputs.""" - logit_bias: Optional[dict[int, float]] = None - """If provided, the engine will construct a logits processor that applies - these logit biases.""" - allowed_token_ids: Optional[list[int]] = None - """If provided, the engine will construct a logits processor which only - retains scores for the given token ids.""" - extra_args: Optional[dict[str, Any]] = None - """Arbitrary additional args, that can be used by custom sampling - implementations, plugins, etc. Not used by any in-tree sampling - implementations.""" - - # Fields used for bad words - bad_words: Optional[list[str]] = None - """Words that are not allowed to be generated. More precisely, only the - last token of a corresponding token sequence is not allowed when the next - generated token can complete the sequence.""" - _bad_words_token_ids: Optional[list[list[int]]] = None @staticmethod def from_optional( - n: Optional[int] = 1, - best_of: Optional[int] = None, - presence_penalty: Optional[float] = 0.0, - frequency_penalty: Optional[float] = 0.0, - repetition_penalty: Optional[float] = 1.0, - temperature: Optional[float] = 1.0, - top_p: Optional[float] = 1.0, - top_k: int = 0, - min_p: float = 0.0, - seed: Optional[int] = None, - stop: Optional[Union[str, list[str]]] = None, - stop_token_ids: Optional[list[int]] = None, - bad_words: Optional[list[str]] = None, - include_stop_str_in_output: bool = False, - ignore_eos: bool = False, - max_tokens: Optional[int] = 16, - min_tokens: int = 0, - logprobs: Optional[int] = None, - prompt_logprobs: Optional[int] = None, - detokenize: bool = True, - skip_special_tokens: bool = True, - spaces_between_special_tokens: bool = True, - logits_processors: Optional[list[LogitsProcessor]] = None, - truncate_prompt_tokens: Optional[Annotated[int, - msgspec.Meta( - ge=-1)]] = None, - output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, + 💡默认值也是None structured_outputs: Optional[StructuredOutputsParams] = None, - guided_decoding: Optional[GuidedDecodingParams] = None, - logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, - allowed_token_ids: Optional[list[int]] = None, - extra_args: Optional[dict[str, Any]] = None, ) -> "SamplingParams": - if logit_bias is not None: - # Convert token_id to integer - # Clamp the bias between -100 and 100 per OpenAI API spec - logit_bias = { - int(token): min(100.0, max(-100.0, bias)) - for token, bias in logit_bias.items() - } if 
guided_decoding is not None: warnings.warn( "guided_decoding is deprecated. This will be removed in " + 💡官方将在 v0.12.0 废弃 guided_decoding 参数,使用 structured_outputs 参数替代,在 v0.11.0 版本做了兼容, "v0.12.0 or v1.0.0, which ever is soonest. Please use " "structured_outputs instead.", DeprecationWarning, stacklevel=2) structured_outputs = guided_decoding guided_decoding = None - return SamplingParams( - n=1 if n is None else n, - best_of=best_of, - presence_penalty=0.0 - if presence_penalty is None else presence_penalty, - frequency_penalty=0.0 - if frequency_penalty is None else frequency_penalty, - repetition_penalty=1.0 - if repetition_penalty is None else repetition_penalty, - temperature=1.0 if temperature is None else temperature, - top_p=1.0 if top_p is None else top_p, - top_k=top_k, - min_p=min_p, - seed=seed, - stop=stop, - stop_token_ids=stop_token_ids, - bad_words=bad_words, - include_stop_str_in_output=include_stop_str_in_output, - ignore_eos=ignore_eos, - max_tokens=max_tokens, - min_tokens=min_tokens, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs, - detokenize=detokenize, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - logits_processors=logits_processors, - truncate_prompt_tokens=truncate_prompt_tokens, - output_kind=output_kind, structured_outputs=structured_outputs, - logit_bias=logit_bias, - allowed_token_ids=allowed_token_ids, - extra_args=extra_args, ) - - def __post_init__(self) -> None: - # how we deal with `best_of``: - # if `best_of`` is not set, we default to `n`; - # if `best_of`` is set, we set `n`` to `best_of`, - # and set `_real_n`` to the original `n`. - # when we return the result, we will check - # if we need to return `n` or `_real_n` results - if self.best_of: - if self.best_of < self.n: - raise ValueError( - f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") - if not self._real_n: - self._real_n = self.n - self.n = self.best_of - - if 0 < self.temperature < _MAX_TEMP: - logger.warning( - "temperature %s is less than %s, which may cause numerical " - "errors nan or inf in tensors. We have maxed it out to %s.", - self.temperature, _MAX_TEMP, _MAX_TEMP) - self.temperature = max(self.temperature, _MAX_TEMP) - - if self.seed == -1: - self.seed = None - - if self.stop is None: - self.stop = [] - elif isinstance(self.stop, str): - self.stop = [self.stop] - - if self.stop_token_ids is None: - self.stop_token_ids = [] - - if self.bad_words is None: - self.bad_words = [] - - if self.logprobs is True: - self.logprobs = 1 - - if self.prompt_logprobs is True: - self.prompt_logprobs = 1 - - # Number of characters to hold back for stop string evaluation - # until sequence is finished. - if self.stop and not self.include_stop_str_in_output: - self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 - - self._verify_args() - - if self.temperature < _SAMPLING_EPS: - # Zero temperature means greedy sampling. - self.top_p = 1.0 - self.top_k = 0 - self.min_p = 0.0 - self._verify_greedy_sampling() - - # eos_token_id is added to this by the engine - self._all_stop_token_ids.update(self.stop_token_ids) - - if self.guided_decoding is not None: - warnings.warn( - "guided_decoding is deprecated. This will be removed in " - "v0.12.0 or v1.0.0, which ever is soonest. 
Please use " - "structured_outputs instead.", - DeprecationWarning, - stacklevel=2) - self.structured_outputs = self.guided_decoding - self.guided_decoding = None - - def _verify_args(self) -> None: - if not isinstance(self.n, int): - raise ValueError(f"n must be an int, but is of " - f"type {type(self.n)}") - if self.n < 1: - raise ValueError(f"n must be at least 1, got {self.n}.") - if self.best_of is not None: - if not isinstance(self.best_of, int): - raise ValueError( - f"best_of must be an integer, got {type(self.best_of)}") - if self.best_of < 1: - raise ValueError( - f"best_of must be at least 1, got {self.best_of}") - if self.best_of < self.n: - raise ValueError( - f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") - if not -2.0 <= self.presence_penalty <= 2.0: - raise ValueError("presence_penalty must be in [-2, 2], got " - f"{self.presence_penalty}.") - if not -2.0 <= self.frequency_penalty <= 2.0: - raise ValueError("frequency_penalty must be in [-2, 2], got " - f"{self.frequency_penalty}.") - if self.repetition_penalty <= 0.0: - raise ValueError( - "repetition_penalty must be greater than zero, got " - f"{self.repetition_penalty}.") - if self.temperature < 0.0: - raise ValueError( - f"temperature must be non-negative, got {self.temperature}.") - if not 0.0 < self.top_p <= 1.0: - raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.") - # quietly accept -1 as disabled, but prefer 0 - if self.top_k < -1: - raise ValueError(f"top_k must be 0 (disable), or at least 1, " - f"got {self.top_k}.") - if not isinstance(self.top_k, int): - raise TypeError( - f"top_k must be an integer, got {type(self.top_k).__name__}") - if not 0.0 <= self.min_p <= 1.0: - raise ValueError("min_p must be in [0, 1], got " - f"{self.min_p}.") - if self.max_tokens is not None and self.max_tokens < 1: - raise ValueError( - f"max_tokens must be at least 1, got {self.max_tokens}.") - if self.min_tokens < 0: - raise ValueError(f"min_tokens must be greater than or equal to 0, " - f"got {self.min_tokens}.") - if self.max_tokens is not None and self.min_tokens > self.max_tokens: - raise ValueError( - f"min_tokens must be less than or equal to " - f"max_tokens={self.max_tokens}, got {self.min_tokens}.") - if (self.logprobs is not None and self.logprobs != -1 - and self.logprobs < 0): - raise ValueError( - f"logprobs must be non-negative or -1, got {self.logprobs}.") - if (self.prompt_logprobs is not None and self.prompt_logprobs != -1 - and self.prompt_logprobs < 0): - raise ValueError( - f"prompt_logprobs must be non-negative or -1, got " - f"{self.prompt_logprobs}.") - if (self.truncate_prompt_tokens is not None - and (self.truncate_prompt_tokens == 0 - or self.truncate_prompt_tokens < -1)): - raise ValueError( - f"truncate_prompt_tokens must be an integer >= 1 or -1, " - f"got {self.truncate_prompt_tokens}") - assert isinstance(self.stop_token_ids, list) - if not all(isinstance(st_id, int) for st_id in self.stop_token_ids): - raise ValueError(f"stop_token_ids must contain only integers, " - f"got {self.stop_token_ids}.") - assert isinstance(self.stop, list) - if any(not stop_str for stop_str in self.stop): - raise ValueError("stop cannot contain an empty string.") - if self.stop and not self.detokenize: - raise ValueError( - "stop strings are only supported when detokenize is True. 
" - "Set detokenize=True to use stop.") - if self.best_of != self._real_n and self.output_kind == ( - RequestOutputKind.DELTA): - raise ValueError("best_of must equal n to use output_kind=DELTA") - - def _verify_greedy_sampling(self) -> None: - if self.n > 1: - raise ValueError("n must be 1 when using greedy sampling, " - f"got {self.n}.") - - def update_from_generation_config( - self, - generation_config: dict[str, Any], - model_eos_token_id: Optional[int] = None) -> None: - """Update if there are non-default values from generation_config""" - - if model_eos_token_id is not None: - # Add the eos token id into the sampling_params to support - # min_tokens processing. - self._all_stop_token_ids.add(model_eos_token_id) - - # Update eos_token_id for generation - if (eos_ids := generation_config.get("eos_token_id")) is not None: - # it can be either int or list of int - eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids) - if model_eos_token_id is not None: - # We don't need to include the primary eos_token_id in - # stop_token_ids since it's handled separately for stopping - # purposes. - eos_ids.discard(model_eos_token_id) - if eos_ids: - self._all_stop_token_ids.update(eos_ids) - if not self.ignore_eos: - eos_ids.update(self.stop_token_ids) - self.stop_token_ids = list(eos_ids) - - def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: - if not self.bad_words: - return - self._bad_words_token_ids = [] - for bad_word in self.bad_words: - # To prohibit words both at the beginning - # and in the middle of text - # (related to add_prefix_space tokenizer parameter) - for add_prefix_space in [False, True]: - prefix = " " if add_prefix_space else "" - prompt = prefix + bad_word.lstrip() - prompt_token_ids = tokenizer.encode(text=prompt, - add_special_tokens=False) - - # If no space at the beginning - # or if prefix space produces a new word token - if (not add_prefix_space) or ( - add_prefix_space and prompt_token_ids[0] - != self._bad_words_token_ids[-1][0] - and len(prompt_token_ids) == len( - self._bad_words_token_ids[-1])): - self._bad_words_token_ids.append(prompt_token_ids) - - invalid_token_ids = [ - token_id for bad_words_token_ids in self._bad_words_token_ids - for token_id in bad_words_token_ids - if token_id < 0 or token_id > tokenizer.max_token_id - ] - if len(invalid_token_ids) > 0: - raise ValueError( - f"The model vocabulary size is {tokenizer.max_token_id+1}," - f" but the following tokens" - f" were specified as bad: {invalid_token_ids}." - f" All token id values should be integers satisfying:" - f" 0 <= token_id <= {tokenizer.max_token_id}.") - - @cached_property - def sampling_type(self) -> SamplingType: - if self.temperature < _SAMPLING_EPS: - return SamplingType.GREEDY - if self.seed is not None: - return SamplingType.RANDOM_SEED - return SamplingType.RANDOM - - @property - def all_stop_token_ids(self) -> set[int]: - return self._all_stop_token_ids - - @property - def bad_words_token_ids(self) -> Optional[list[list[int]]]: - # For internal use only. Backward compatibility not guaranteed - return self._bad_words_token_ids - - def clone(self) -> "SamplingParams": - """Deep copy, but maybe not the LogitsProcessor objects. - - LogitsProcessor objects may contain an arbitrary, nontrivial amount of - data that is expensive to copy. 
However, if not copied, the processor - needs to support parallel decoding for multiple sequences - See https://github.com/vllm-project/vllm/issues/3087 - """ - - logit_processor_refs = None if self.logits_processors is None else { - id(lp): lp.clone() if hasattr(lp, 'clone') else lp - for lp in self.logits_processors - } - return copy.deepcopy(self, memo=logit_processor_refs) - - def __repr__(self) -> str: - return ( - f"SamplingParams(n={self.n}, " - f"presence_penalty={self.presence_penalty}, " - f"frequency_penalty={self.frequency_penalty}, " - f"repetition_penalty={self.repetition_penalty}, " - f"temperature={self.temperature}, " - f"top_p={self.top_p}, " - f"top_k={self.top_k}, " - f"min_p={self.min_p}, " - f"seed={self.seed}, " - f"stop={self.stop}, " - f"stop_token_ids={self.stop_token_ids}, " - f"bad_words={self.bad_words}, " - f"include_stop_str_in_output={self.include_stop_str_in_output}, " - f"ignore_eos={self.ignore_eos}, " - f"max_tokens={self.max_tokens}, " - f"min_tokens={self.min_tokens}, " - f"logprobs={self.logprobs}, " - f"prompt_logprobs={self.prompt_logprobs}, " - f"skip_special_tokens={self.skip_special_tokens}, " - "spaces_between_special_tokens=" - f"{self.spaces_between_special_tokens}, " - f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " - f"structured_outputs={self.structured_outputs}, " - f"extra_args={self.extra_args})") ``` -#### 分支:release/v0.10.2 - -https://github.com/vllm-project/vllm/blob/releases/v0.10.2/vllm/sampling_params.py +### StructuredOutputOptions ```python -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Sampling parameters for text generation.""" -import copy -from dataclasses import dataclass -from enum import Enum, IntEnum -from functools import cached_property -from typing import Annotated, Any, Optional, Union - -import msgspec -from pydantic import BaseModel - -from vllm.logger import init_logger -from vllm.logits_process import LogitsProcessor -from vllm.transformers_utils.tokenizer import AnyTokenizer - -logger = init_logger(__name__) - -_SAMPLING_EPS = 1e-5 -_MAX_TEMP = 1e-2 - - -class SamplingType(IntEnum): - GREEDY = 0 - RANDOM = 1 - RANDOM_SEED = 2 - - -# maybe make msgspec? 
-@dataclass -class GuidedDecodingParams: - """One of these fields will be used to build a logit processor.""" - json: Optional[Union[str, dict]] = None - regex: Optional[str] = None - choice: Optional[list[str]] = None - grammar: Optional[str] = None - json_object: Optional[bool] = None - """These are other options that can be set""" - backend: Optional[str] = None - backend_was_auto: bool = False - disable_fallback: bool = False - disable_any_whitespace: bool = False - disable_additional_properties: bool = False - whitespace_pattern: Optional[str] = None - structural_tag: Optional[str] = None - - @staticmethod - def from_optional( - json: Optional[Union[dict, BaseModel, str]] = None, - regex: Optional[str] = None, - choice: Optional[list[str]] = None, - grammar: Optional[str] = None, - json_object: Optional[bool] = None, - backend: Optional[str] = None, - whitespace_pattern: Optional[str] = None, - structural_tag: Optional[str] = None, - ) -> Optional["GuidedDecodingParams"]: - if all(arg is None for arg in (json, regex, choice, grammar, - json_object, structural_tag)): - return None - # Extract json schemas from pydantic models - if isinstance(json, (BaseModel, type(BaseModel))): - json = json.model_json_schema() - return GuidedDecodingParams( - json=json, - regex=regex, - choice=choice, - grammar=grammar, - json_object=json_object, - backend=backend, - whitespace_pattern=whitespace_pattern, - structural_tag=structural_tag, - ) - - def __post_init__(self): - """Validate that some fields are mutually exclusive.""" - guide_count = sum([ - self.json is not None, self.regex is not None, self.choice - is not None, self.grammar is not None, self.json_object is not None - ]) - if guide_count > 1: - raise ValueError( - "You can only use one kind of guided decoding but multiple are " - f"specified: {self.__dict__}") - - -class RequestOutputKind(Enum): - # Return entire output so far in every RequestOutput - CUMULATIVE = 0 - # Return only deltas in each RequestOutput - DELTA = 1 - # Do not return intermediate RequestOutput - FINAL_ONLY = 2 - - -class SamplingParams( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - # required for @cached_property. - dict=True): # type: ignore[call-arg] - """Sampling parameters for text generation. - - Overall, we follow the sampling parameters from the OpenAI text completion - API (https://platform.openai.com/docs/api-reference/completions/create). - In addition, we support beam search, which is not supported by OpenAI. - """ - - n: int = 1 - """Number of output sequences to return for the given prompt.""" - best_of: Optional[int] = None - """Number of output sequences that are generated from the prompt. From - these `best_of` sequences, the top `n` sequences are returned. `best_of` - must be greater than or equal to `n`. By default, `best_of` is set to `n`. - Warning, this is only supported in V0.""" - _real_n: Optional[int] = None - presence_penalty: float = 0.0 - """Penalizes new tokens based on whether they appear in the generated text - so far. Values > 0 encourage the model to use new tokens, while values < 0 - encourage the model to repeat tokens.""" - frequency_penalty: float = 0.0 - """Penalizes new tokens based on their frequency in the generated text so - far. Values > 0 encourage the model to use new tokens, while values < 0 - encourage the model to repeat tokens.""" - repetition_penalty: float = 1.0 - """Penalizes new tokens based on whether they appear in the prompt and the - generated text so far. 
Values > 1 encourage the model to use new tokens, - while values < 1 encourage the model to repeat tokens.""" - temperature: float = 1.0 - """Controls the randomness of the sampling. Lower values make the model - more deterministic, while higher values make the model more random. Zero - means greedy sampling.""" - top_p: float = 1.0 - """Controls the cumulative probability of the top tokens to consider. Must - be in (0, 1]. Set to 1 to consider all tokens.""" - top_k: int = 0 - """Controls the number of top tokens to consider. Set to 0 (or -1) to - consider all tokens.""" - min_p: float = 0.0 - """Represents the minimum probability for a token to be considered, - relative to the probability of the most likely token. Must be in [0, 1]. - Set to 0 to disable this.""" - seed: Optional[int] = None - """Random seed to use for the generation.""" - stop: Optional[Union[str, list[str]]] = None - """String(s) that stop the generation when they are generated. The returned - output will not contain the stop strings.""" - stop_token_ids: Optional[list[int]] = None - """Token IDs that stop the generation when they are generated. The returned - output will contain the stop tokens unless the stop tokens are special - tokens.""" - ignore_eos: bool = False - """Whether to ignore the EOS token and continue generating - tokens after the EOS token is generated.""" - max_tokens: Optional[int] = 16 - """Maximum number of tokens to generate per output sequence.""" - min_tokens: int = 0 - """Minimum number of tokens to generate per output sequence before EOS or - `stop_token_ids` can be generated""" - logprobs: Optional[int] = None - """Number of log probabilities to return per output token. When set to - `None`, no probability is returned. If set to a non-`None` value, the - result includes the log probabilities of the specified number of most - likely tokens, as well as the chosen tokens. Note that the implementation - follows the OpenAI API: The API will always return the log probability of - the sampled token, so there may be up to `logprobs+1` elements in the - response. When set to -1, return all `vocab_size` log probabilities.""" - prompt_logprobs: Optional[int] = None - """Number of log probabilities to return per prompt token. - When set to -1, return all `vocab_size` log probabilities.""" - # NOTE: This parameter is only exposed at the engine level for now. - # It is not exposed in the OpenAI API server, as the OpenAI API does - # not support returning only a list of token IDs. - detokenize: bool = True - """Whether to detokenize the output.""" - skip_special_tokens: bool = True - """Whether to skip special tokens in the output.""" - spaces_between_special_tokens: bool = True - """Whether to add spaces between special tokens in the output.""" - # Optional[list[LogitsProcessor]] type. We use Any here because - # Optional[list[LogitsProcessor]] type is not supported by msgspec. - logits_processors: Optional[Any] = None - """Functions that modify logits based on previously generated tokens, and - optionally prompt tokens as a first argument.""" - include_stop_str_in_output: bool = False - """Whether to include the stop strings in output text.""" - truncate_prompt_tokens: Optional[Annotated[int, - msgspec.Meta(ge=-1)]] = None - """If set to -1, will use the truncation size supported by the model. If - set to an integer k, will use only the last k tokens from the prompt - (i.e., left truncation). 
If set to `None`, truncation is disabled.""" - output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE - - # The below fields are not supposed to be used as an input. - # They are set in post_init. - output_text_buffer_length: int = 0 - _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) - - # Fields used to construct logits processors - guided_decoding: Optional[GuidedDecodingParams] = None - """If provided, the engine will construct a guided decoding logits - processor from these parameters.""" - logit_bias: Optional[dict[int, float]] = None - """If provided, the engine will construct a logits processor that applies - these logit biases.""" - allowed_token_ids: Optional[list[int]] = None - """If provided, the engine will construct a logits processor which only - retains scores for the given token ids.""" - extra_args: Optional[dict[str, Any]] = None - """Arbitrary additional args, that can be used by custom sampling - implementations, plugins, etc. Not used by any in-tree sampling - implementations.""" - - # Fields used for bad words - bad_words: Optional[list[str]] = None - """Words that are not allowed to be generated. More precisely, only the - last token of a corresponding token sequence is not allowed when the next - generated token can complete the sequence.""" - _bad_words_token_ids: Optional[list[list[int]]] = None - - @staticmethod - def from_optional( - n: Optional[int] = 1, - best_of: Optional[int] = None, - presence_penalty: Optional[float] = 0.0, - frequency_penalty: Optional[float] = 0.0, - repetition_penalty: Optional[float] = 1.0, - temperature: Optional[float] = 1.0, - top_p: Optional[float] = 1.0, - top_k: int = 0, - min_p: float = 0.0, - seed: Optional[int] = None, - stop: Optional[Union[str, list[str]]] = None, - stop_token_ids: Optional[list[int]] = None, - bad_words: Optional[list[str]] = None, - include_stop_str_in_output: bool = False, - ignore_eos: bool = False, - max_tokens: Optional[int] = 16, - min_tokens: int = 0, - logprobs: Optional[int] = None, - prompt_logprobs: Optional[int] = None, - detokenize: bool = True, - skip_special_tokens: bool = True, - spaces_between_special_tokens: bool = True, - logits_processors: Optional[list[LogitsProcessor]] = None, - truncate_prompt_tokens: Optional[Annotated[int, - msgspec.Meta( - ge=-1)]] = None, - output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, - guided_decoding: Optional[GuidedDecodingParams] = None, - logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, - allowed_token_ids: Optional[list[int]] = None, - extra_args: Optional[dict[str, Any]] = None, - ) -> "SamplingParams": - if logit_bias is not None: - # Convert token_id to integer - # Clamp the bias between -100 and 100 per OpenAI API spec - logit_bias = { - int(token): min(100.0, max(-100.0, bias)) - for token, bias in logit_bias.items() - } - - return SamplingParams( - n=1 if n is None else n, - best_of=best_of, - presence_penalty=0.0 - if presence_penalty is None else presence_penalty, - frequency_penalty=0.0 - if frequency_penalty is None else frequency_penalty, - repetition_penalty=1.0 - if repetition_penalty is None else repetition_penalty, - temperature=1.0 if temperature is None else temperature, - top_p=1.0 if top_p is None else top_p, - top_k=top_k, - min_p=min_p, - seed=seed, - stop=stop, - stop_token_ids=stop_token_ids, - bad_words=bad_words, - include_stop_str_in_output=include_stop_str_in_output, - ignore_eos=ignore_eos, - max_tokens=max_tokens, - min_tokens=min_tokens, - logprobs=logprobs, - 
prompt_logprobs=prompt_logprobs, - detokenize=detokenize, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - logits_processors=logits_processors, - truncate_prompt_tokens=truncate_prompt_tokens, - output_kind=output_kind, - guided_decoding=guided_decoding, - logit_bias=logit_bias, - allowed_token_ids=allowed_token_ids, - extra_args=extra_args, - ) - - def __post_init__(self) -> None: - # how we deal with `best_of``: - # if `best_of`` is not set, we default to `n`; - # if `best_of`` is set, we set `n`` to `best_of`, - # and set `_real_n`` to the original `n`. - # when we return the result, we will check - # if we need to return `n` or `_real_n` results - if self.best_of: - if self.best_of < self.n: - raise ValueError( - f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") - if not self._real_n: - self._real_n = self.n - self.n = self.best_of - - if 0 < self.temperature < _MAX_TEMP: - logger.warning( - "temperature %s is less than %s, which may cause numerical " - "errors nan or inf in tensors. We have maxed it out to %s.", - self.temperature, _MAX_TEMP, _MAX_TEMP) - self.temperature = max(self.temperature, _MAX_TEMP) - - if self.seed == -1: - self.seed = None - - if self.stop is None: - self.stop = [] - elif isinstance(self.stop, str): - self.stop = [self.stop] - - if self.stop_token_ids is None: - self.stop_token_ids = [] - - if self.bad_words is None: - self.bad_words = [] - - if self.logprobs is True: - self.logprobs = 1 - - if self.prompt_logprobs is True: - self.prompt_logprobs = 1 - - # Number of characters to hold back for stop string evaluation - # until sequence is finished. - if self.stop and not self.include_stop_str_in_output: - self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 - - self._verify_args() - - if self.temperature < _SAMPLING_EPS: - # Zero temperature means greedy sampling. 
- self.top_p = 1.0 - self.top_k = 0 - self.min_p = 0.0 - self._verify_greedy_sampling() - - # eos_token_id is added to this by the engine - self._all_stop_token_ids.update(self.stop_token_ids) - - def _verify_args(self) -> None: - if not isinstance(self.n, int): - raise ValueError(f"n must be an int, but is of " - f"type {type(self.n)}") - if self.n < 1: - raise ValueError(f"n must be at least 1, got {self.n}.") - if self.best_of is not None: - if not isinstance(self.best_of, int): - raise ValueError( - f"best_of must be an integer, got {type(self.best_of)}") - if self.best_of < 1: - raise ValueError( - f"best_of must be at least 1, got {self.best_of}") - if self.best_of < self.n: - raise ValueError( - f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") - if not -2.0 <= self.presence_penalty <= 2.0: - raise ValueError("presence_penalty must be in [-2, 2], got " - f"{self.presence_penalty}.") - if not -2.0 <= self.frequency_penalty <= 2.0: - raise ValueError("frequency_penalty must be in [-2, 2], got " - f"{self.frequency_penalty}.") - if self.repetition_penalty <= 0.0: - raise ValueError( - "repetition_penalty must be greater than zero, got " - f"{self.repetition_penalty}.") - if self.temperature < 0.0: - raise ValueError( - f"temperature must be non-negative, got {self.temperature}.") - if not 0.0 < self.top_p <= 1.0: - raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.") - # quietly accept -1 as disabled, but prefer 0 - if self.top_k < -1: - raise ValueError(f"top_k must be 0 (disable), or at least 1, " - f"got {self.top_k}.") - if not isinstance(self.top_k, int): - raise TypeError( - f"top_k must be an integer, got {type(self.top_k).__name__}") - if not 0.0 <= self.min_p <= 1.0: - raise ValueError("min_p must be in [0, 1], got " - f"{self.min_p}.") - if self.max_tokens is not None and self.max_tokens < 1: - raise ValueError( - f"max_tokens must be at least 1, got {self.max_tokens}.") - if self.min_tokens < 0: - raise ValueError(f"min_tokens must be greater than or equal to 0, " - f"got {self.min_tokens}.") - if self.max_tokens is not None and self.min_tokens > self.max_tokens: - raise ValueError( - f"min_tokens must be less than or equal to " - f"max_tokens={self.max_tokens}, got {self.min_tokens}.") - if (self.logprobs is not None and self.logprobs != -1 - and self.logprobs < 0): - raise ValueError( - f"logprobs must be non-negative or -1, got {self.logprobs}.") - if (self.prompt_logprobs is not None and self.prompt_logprobs != -1 - and self.prompt_logprobs < 0): - raise ValueError( - f"prompt_logprobs must be non-negative or -1, got " - f"{self.prompt_logprobs}.") - if (self.truncate_prompt_tokens is not None - and (self.truncate_prompt_tokens == 0 - or self.truncate_prompt_tokens < -1)): - raise ValueError( - f"truncate_prompt_tokens must be an integer >= 1 or -1, " - f"got {self.truncate_prompt_tokens}") - assert isinstance(self.stop_token_ids, list) - if not all(isinstance(st_id, int) for st_id in self.stop_token_ids): - raise ValueError(f"stop_token_ids must contain only integers, " - f"got {self.stop_token_ids}.") - assert isinstance(self.stop, list) - if any(not stop_str for stop_str in self.stop): - raise ValueError("stop cannot contain an empty string.") - if self.stop and not self.detokenize: - raise ValueError( - "stop strings are only supported when detokenize is True. 
" - "Set detokenize=True to use stop.") - if self.best_of != self._real_n and self.output_kind == ( - RequestOutputKind.DELTA): - raise ValueError("best_of must equal n to use output_kind=DELTA") - - def _verify_greedy_sampling(self) -> None: - if self.n > 1: - raise ValueError("n must be 1 when using greedy sampling, " - f"got {self.n}.") - - def update_from_generation_config( - self, - generation_config: dict[str, Any], - model_eos_token_id: Optional[int] = None) -> None: - """Update if there are non-default values from generation_config""" - - if model_eos_token_id is not None: - # Add the eos token id into the sampling_params to support - # min_tokens processing. - self._all_stop_token_ids.add(model_eos_token_id) - - # Update eos_token_id for generation - if (eos_ids := generation_config.get("eos_token_id")) is not None: - # it can be either int or list of int - eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids) - if model_eos_token_id is not None: - # We don't need to include the primary eos_token_id in - # stop_token_ids since it's handled separately for stopping - # purposes. - eos_ids.discard(model_eos_token_id) - if eos_ids: - self._all_stop_token_ids.update(eos_ids) - if not self.ignore_eos: - eos_ids.update(self.stop_token_ids) - self.stop_token_ids = list(eos_ids) - - def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: - if not self.bad_words: - return - self._bad_words_token_ids = [] - for bad_word in self.bad_words: - # To prohibit words both at the beginning - # and in the middle of text - # (related to add_prefix_space tokenizer parameter) - for add_prefix_space in [False, True]: - prefix = " " if add_prefix_space else "" - prompt = prefix + bad_word.lstrip() - prompt_token_ids = tokenizer.encode(text=prompt, - add_special_tokens=False) - - # If no space at the beginning - # or if prefix space produces a new word token - if (not add_prefix_space) or ( - add_prefix_space and prompt_token_ids[0] - != self._bad_words_token_ids[-1][0] - and len(prompt_token_ids) == len( - self._bad_words_token_ids[-1])): - self._bad_words_token_ids.append(prompt_token_ids) - - invalid_token_ids = [ - token_id for bad_words_token_ids in self._bad_words_token_ids - for token_id in bad_words_token_ids - if token_id < 0 or token_id > tokenizer.max_token_id - ] - if len(invalid_token_ids) > 0: - raise ValueError( - f"The model vocabulary size is {tokenizer.max_token_id+1}," - f" but the following tokens" - f" were specified as bad: {invalid_token_ids}." - f" All token id values should be integers satisfying:" - f" 0 <= token_id <= {tokenizer.max_token_id}.") - - @cached_property - def sampling_type(self) -> SamplingType: - if self.temperature < _SAMPLING_EPS: - return SamplingType.GREEDY - if self.seed is not None: - return SamplingType.RANDOM_SEED - return SamplingType.RANDOM - - @property - def all_stop_token_ids(self) -> set[int]: - return self._all_stop_token_ids - - @property - def bad_words_token_ids(self) -> Optional[list[list[int]]]: - # For internal use only. Backward compatibility not guaranteed - return self._bad_words_token_ids - - def clone(self) -> "SamplingParams": - """Deep copy, but maybe not the LogitsProcessor objects. - - LogitsProcessor objects may contain an arbitrary, nontrivial amount of - data that is expensive to copy. 
However, if not copied, the processor
-        needs to support parallel decoding for multiple sequences
-        See https://github.com/vllm-project/vllm/issues/3087
-        """
-
-        logit_processor_refs = None if self.logits_processors is None else {
-            id(lp): lp.clone() if hasattr(lp, 'clone') else lp
-            for lp in self.logits_processors
-        }
-        return copy.deepcopy(self, memo=logit_processor_refs)
-
-    def __repr__(self) -> str:
-        return (
-            f"SamplingParams(n={self.n}, "
-            f"presence_penalty={self.presence_penalty}, "
-            f"frequency_penalty={self.frequency_penalty}, "
-            f"repetition_penalty={self.repetition_penalty}, "
-            f"temperature={self.temperature}, "
-            f"top_p={self.top_p}, "
-            f"top_k={self.top_k}, "
-            f"min_p={self.min_p}, "
-            f"seed={self.seed}, "
-            f"stop={self.stop}, "
-            f"stop_token_ids={self.stop_token_ids}, "
-            f"bad_words={self.bad_words}, "
-            f"include_stop_str_in_output={self.include_stop_str_in_output}, "
-            f"ignore_eos={self.ignore_eos}, "
-            f"max_tokens={self.max_tokens}, "
-            f"min_tokens={self.min_tokens}, "
-            f"logprobs={self.logprobs}, "
-            f"prompt_logprobs={self.prompt_logprobs}, "
-            f"skip_special_tokens={self.skip_special_tokens}, "
-            "spaces_between_special_tokens="
-            f"{self.spaces_between_special_tokens}, "
-            f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
-            f"guided_decoding={self.guided_decoding}, "
-            f"extra_args={self.extra_args})")
-
-
-class BeamSearchParams(
-        msgspec.Struct,
-        omit_defaults=True,  # type: ignore[call-arg]
-        # required for @cached_property.
-        dict=True):  # type: ignore[call-arg]
-    """Beam search parameters for text generation."""
-    beam_width: int
-    max_tokens: int
-    ignore_eos: bool = False
-    temperature: float = 0.0
-    length_penalty: float = 1.0
-    include_stop_str_in_output: bool = False
 ```
+
+### StructuredOutputOptions
+
+```python
+class StructuredOutputOptions(enum.Enum):
+    JSON = enum.auto()
+    JSON_OBJECT = enum.auto()
+    REGEX = enum.auto()
+    GRAMMAR = enum.auto()
+    CHOICE = enum.auto()
+    STRUCTURAL_TAG = enum.auto()
+```
+> 💡 Only these 6 kinds are supported, and each of them maps to a different attribute of structured_outputs.
+
+## Inference
+
+The client did send a structured_outputs parameter, but it was either an empty object or an object that sets none of these 6 attributes, so the function fell through to the final else branch and raised the exception.
+
+## Reproduction
+
+
 ## Crash log excerpt
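Since the Reproduction section above is still empty, here is a minimal, self-contained sketch of the failure mode described under "Inference". It deliberately does not import vLLM: `FakeStructuredOutputsParams` and `classify_structured_output` are stand-ins invented for illustration, and the if/elif chain simply mirrors the branches of `get_structured_output_key` quoted earlier. An object whose six structured-output attributes are all `None` falls through to the final `else` branch and raises the same `ValueError`.

```python
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class FakeStructuredOutputsParams:
    """Stand-in for the structured_outputs object; every attribute defaults to None."""
    json: Optional[Any] = None
    regex: Optional[str] = None
    choice: Optional[list[str]] = None
    grammar: Optional[str] = None
    json_object: Optional[bool] = None
    structural_tag: Optional[str] = None


def classify_structured_output(params: FakeStructuredOutputsParams) -> str:
    # Mirrors the if/elif chain of get_structured_output_key quoted above.
    if params.json is not None:
        return "JSON"
    elif params.json_object:
        return "JSON_OBJECT"
    elif params.regex is not None:
        return "REGEX"
    elif params.choice is not None:
        return "CHOICE"
    elif params.grammar is not None:
        return "GRAMMAR"
    elif params.structural_tag is not None:
        return "STRUCTURAL_TAG"
    else:
        # The branch the engine is suspected to have hit.
        raise ValueError("No valid structured output parameter found")


# A well-formed request: exactly one structured-output attribute is set.
print(classify_structured_output(
    FakeStructuredOutputsParams(json='{"type": "object"}')))  # -> JSON

# An "empty" structured_outputs object reproduces the suspected error path.
try:
    classify_structured_output(FakeStructuredOutputsParams())
except ValueError as exc:
    print(f"empty structured_outputs -> {exc}")
```

Running the sketch prints `JSON` for the valid request and `empty structured_outputs -> No valid structured output parameter found` for the empty one, which is the error the engine is presumed to have raised before it died.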