文件名 SGlang确定性推理解决方案-启动命令.md

SGlang确定性推理解决方案-启动命令

正文

# Docker Compose definition for an SGLang server that serves Qwen3-8B with a
# LoRA adapter and deterministic inference enabled.
services:
  qwen3-8B:
    image: lmsysorg/sglang:v0.5.5.post3-cu129-amd64
    container_name: Qwen3-8B-Med
    # NOTE(review): with `ipc: host` the container shares the host's /dev/shm,
    # so `shm_size` below is likely ignored by Docker - confirm and drop one.
    ipc: "host"
    volumes:
      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
      # Base model weights; referenced by --model-path below.
      - /data/models/qwen3-8B:/data/model
      # LoRA checkpoint; referenced by --lora-paths below.
      - /data/workspace/qx/medllm/output/lora/v10-20251129-045958/checkpoint-5400:/data/lora
    restart: always
    shm_size: '96gb'
    # Host networking: the server port (10170) is bound directly on the host,
    # so no `ports:` mapping is needed.
    network_mode: "host"
    environment:
      # NOTE(review): TRACE is the most verbose NCCL log level; presumably left
      # on for debugging - consider WARN for long-running deployments.
      - NCCL_DEBUG=TRACE
      - NCCL_IGNORE_DISABLED_P2P=1
    command: [
      "python3", "-m", "sglang.launch_server",
      "--model-path", "/data/model",
      "--reasoning-parser", "qwen3",
      # "--context-length", "40960",
      "--served-model-name", "Qwen3-8B-Med",
      "--host", "0.0.0.0",
      "--port", "10170",
      # tp=1 / dp=4: one replica per GPU listed under deploy.resources below.
      "--tp", "1",
      "--dp", "4",
      # NOTE(review): the model is Qwen3 but the tool-call parser is the
      # DeepSeek-V3 one; verify this is intentional (SGLang ships a Qwen parser).
      "--tool-call-parser", "deepseekv3",
      # "--allow-auto-truncate",
      # "--enable-dp-attention",
      # "--enable-mixed-chunk",
      "--enable-lora",
      "--lora-paths", "lora0=/data/lora",
      "--max-loras-per-batch", "1",
      # "--disable-radix-cache",
      "--enable-deterministic-inference",
      # "--attention-backend", "torch_native",
      # "--mem-fraction-static", "0.7",
      # "--speculative-num-steps", "3",
      # "--speculative-eagle-topk", "1",
      # "--speculative-num-draft-tokens", "4",
      # "--speculative-algo", "NEXTN",
    ]
    healthcheck:
      # Must probe the same port the server is launched with (--port above).
      test: ["CMD", "curl", "-f", "http://localhost:10170/health"]
      interval: 60s
      timeout: 10s
      retries: 3
      # Grace period before failed probes count against `retries`.
      start_period: 240s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Full host set would be '0','1','2','3','4','5','6','7'.
              device_ids: ['0', '1', '2', '3']
              capabilities: [gpu]