# RTX 4090 Laptop Operations Log

```shell
# Switched to the USTC mirror because the Tsinghua mirror returned HTTP 403; the config lives here
cat /etc/apt/sources.list

# Install OpenSSH; the port stays at the default 22
sudo apt install openssh-server -y
sudo systemctl enable ssh
sudo systemctl start ssh

# Install the NVIDIA GPU driver and CUDA toolkit via the CUDA repo keyring
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
sudo apt-get install -y cuda-drivers
nvidia-smi

# Install nvidia-cuda-toolkit
sudo apt install nvidia-cuda-toolkit
nvcc -V

# Create a new directory to hold the models and other files used by vLLM
mkdir /home/ss/vllm-py12 && cd /home/ss/vllm-py12

# Create a new conda environment; every pip install below runs inside it
conda create -n vllm-py12 python=3.12 -y
conda activate vllm-py12

# Install vLLM
pip install vllm -i https://mirrors.cloud.tencent.com/pypi/simple --extra-index-url https://download.pytorch.org/whl/cu128

# Install ModelScope
pip install modelscope -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

# Pull the gpt-oss-20b model
modelscope download --model openai-mirror/gpt-oss-20b --local_dir /home/ss/vllm-py12/gpt-oss-20b

# Serving gpt-oss-20b failed: the mobile RTX 4090 has only 16 GB of VRAM,
# and the model needs roughly 16-24 GB
vllm serve \
  /home/ss/vllm-py12/gpt-oss-20b \
  --port 18777 \
  --api-key token_lcfc \
  --served-model-name gpt-oss-20b \
  --gpu-memory-utilization 0.95 \
  --tool-call-parser openai \
  --enable-auto-tool-choice

# Qwen3-8B would also need 16-24 GB of VRAM, so Qwen3-0.6B was downloaded instead
modelscope download --model Qwen/Qwen3-0.6B --local_dir /home/ss/vllm-py12/qwen3-06b

# Serve Qwen3-0.6B
vllm serve /home/ss/vllm-py12/qwen3-06b \
  --host 0.0.0.0 \
  --port 8000 \
  --served-model-name Qwen3-0.6B \
  --tensor-parallel-size 1 \
  --dtype auto \
  --gpu-memory-utilization 0.9 \
  --max-model-len 32768 \
  --trust-remote-code
```

#### A new script to test the bug in the structured-output function

```shell
vim /home/ss/vllm-py12/vllm-crash-test.py
```

```python
from enum import Enum

from pydantic import BaseModel

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams


# Define the structured-output schema
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType


# Get the JSON schema (not used by this crash test; see the valid-usage sketch at the end)
json_schema = CarDescription.model_json_schema()

# Set up the prompt
prompt = (
    "Generate a JSON with the brand, model and car_type of "
    "the most iconic car from the 90's"
)


def format_output(title: str, output: str):
    print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")


def main():
    # 1. Initialize a local LLM from the local model files
    llm = LLM(
        model="/home/ss/vllm-py12/qwen3-06b",  # points at the local model path
        max_model_len=1024,
        enable_prefix_caching=True,
        gpu_memory_utilization=0.9,
    )

    # 2. Build an invalid guided_decoding: no field carries a usable value.
    #    This should make get_structured_output_key() raise a ValueError.
    guided_decoding_invalid = GuidedDecodingParams(
        json=None,
        json_object=False,
        regex=None,
        choice=None,
        grammar=None,
        structural_tag=None,
    )

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=512,
        guided_decoding=guided_decoding_invalid,  # passed in, but with no valid field
    )

    # 3. Generate (expected to trigger the ValueError)
    try:
        outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
        for output in outputs:
            generated_text = output.outputs[0].text
            format_output("Output", generated_text)
    except Exception as e:
        print(f"Caught expected error: {e}")


if __name__ == "__main__":
    main()
```

#### Reproduce

```shell
python /home/ss/vllm-py12/vllm-crash-test.py
```
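#### Sanity check: query the OpenAI-compatible endpoint

To confirm the Qwen3-0.6B server above is actually responding, vLLM's OpenAI-compatible endpoint can be queried with the official `openai` client. A minimal sketch, assuming the `vllm serve` command above is running locally on port 8000 without an API key; the `openai` package is an extra dependency not installed in the steps above, and the `"EMPTY"` key is a placeholder the client requires:

```python
from openai import OpenAI

# Point the client at the local vLLM server instead of api.openai.com
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="Qwen3-0.6B",  # must match --served-model-name
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=64,
)
print(resp.choices[0].message.content)
```

If the server is started with a key (the gpt-oss-20b command above uses `--api-key token_lcfc`), pass that value as `api_key` instead of the placeholder.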
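#### For contrast: a valid guided-decoding request

The crash test deliberately leaves every `GuidedDecodingParams` field empty. For contrast, here is a sketch of the valid path, feeding in the `CarDescription` JSON schema that the script computes but never uses. Same local model path and conda environment assumed; this follows vLLM's documented offline structured-output pattern rather than anything new:

```python
from enum import Enum

from pydantic import BaseModel

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams


class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType


llm = LLM(
    model="/home/ss/vllm-py12/qwen3-06b",
    max_model_len=1024,
    gpu_memory_utilization=0.9,
)

# A populated json= field is exactly what the crash test leaves out
guided = GuidedDecodingParams(json=CarDescription.model_json_schema())
params = SamplingParams(temperature=0.0, max_tokens=512, guided_decoding=guided)

outputs = llm.generate(
    prompts=(
        "Generate a JSON with the brand, model and car_type of "
        "the most iconic car from the 90's"
    ),
    sampling_params=params,
)
print(outputs[0].outputs[0].text)  # should parse as CarDescription JSON
```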
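#### Appendix: rough VRAM math

The model choices above came down to the 16 GB of VRAM on the mobile RTX 4090. A back-of-envelope weight-memory estimate makes the cutoff concrete; this ignores KV cache, activations, and CUDA overhead, so actual usage is higher:

```python
def weights_gib(n_params_billion: float, bytes_per_param: float) -> float:
    """Rough weight-only memory estimate (no KV cache or activations)."""
    return n_params_billion * 1e9 * bytes_per_param / 2**30

# Qwen3-0.6B in bf16 (2 bytes/param): ~1.1 GiB of weights -- easy fit on 16 GiB
print(f"Qwen3-0.6B: {weights_gib(0.6, 2):.1f} GiB")

# A 20B-parameter model in bf16: ~37 GiB of weights -- far beyond 16 GiB,
# which is why even quantized 20B variants are a tight squeeze
print(f"20B @ bf16: {weights_gib(20, 2):.1f} GiB")
```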