1
0
md/20251014.md
liushuang 25e718d9a6 add
2025-10-15 10:32:37 +08:00

3.4 KiB
Raw Blame History

RTX4090笔电操作记录

# 因清华大学开源镜像站 HTTP/403 换了中科大的镜像站,配置信息存放在这里
cat /etc/apt/sources.list

# 安装 openssh 端口号是默认的 22 没有修改
sudo apt install openssh-server -y
sudo systemctl enable ssh
sudo systemctl start ssh

# 安装 NVDIA 显卡驱动和
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
sudo apt-get install -y cuda-drivers
nvidia-smi

# 安装 nvidia-cuda-toolkit
apt install nvidia-cuda-toolkit
nvcc -V

# 创建了一个新的目录,用于存储 vllm 使用的模型或其他文件
mkdir /home/ss/vllm-py12 && cd /home/ss/vllm-py12

# 用 conda 建了个新环境,以下 pip install 都是在该环境执行的
conda create -n vllm-py12 python=3.12 -y
conda activate vllm-py12

# 安装 vllm
pip install vllm -i http://mirrors.cloud.tencent.com/pypi/simple --extra-index-url https://download.pytorch.org/whl/cu128

# 安装 modelscope
pip install modelscope -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

# 拉取 gpt-oss-20b 模型
modelscope download --model openai-mirror/gpt-oss-20b --local_dir /home/ss/vllm-py12/gpt-oss-20b

# 运行 gpt-oss-20b 模型失败,移动端的 RTX4090 只有 16GB 显存,至少需要 16~24GB 显存
vllm serve \
    /home/ss/vllm-py12/gpt-oss-20b \
    --port 18777 \
    --api-key token_lcfc \
    --served-model-name gpt-oss-20b \
    --gpu-memory-utilization 0.95  \
    --tool-call-parser openai \
    --enable-auto-tool-choice

# Qwen3-8b 也需要 16~24GB显存所以下载了 Qwen3-0.6B 
modelscope download --model Qwen/Qwen3-0.6B --local_dir /home/ss/vllm-py12/qwen3-06b

# 运行 Qwen3-8b 
vllm serve /home/ss/vllm-py12/qwen3-06b \
    --host 0.0.0.0 \
    --port 8000 \
    --served-model-name Qwen3-0.6B \
    --tensor-parallel-size 1 \
    --dtype auto \
    --gpu-memory-utilization 0.9 \
    --max-model-len 32768 \
    --trust-remote-code

新建了一个脚本去测试结构化输出函数的bug

vim /home/ss/vllm-py12/vllm-crash-test.py
from enum import Enum
from pydantic import BaseModel
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

json_schema = CarDescription.model_json_schema()
# guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
sampling_params_json = SamplingParams(guided_decoding={})
prompt_json = (
    "Generate a JSON with the brand, model and car_type of"
    "the most iconic car from the 90's"
)

def format_output(title: str, output: str):
    print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")

def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
    outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
    return outputs[0].outputs[0].text

def main():
    llm = LLM(model="qwen", max_model_len=100)
    json_output = generate_output(prompt_json, sampling_params_json, llm)
    format_output("Guided decoding by JSON", json_output)

if __name__ == "__main__":
    main()