# Docker Compose definition: SGLang server for the Qwen3-8B-Med model.
services:
  # SGLang server (sglang.launch_server) for the Qwen3-8B weights mounted at
  # /data/model, with one LoRA adapter mounted at /data/lora.
  qwen3-8B:
    image: lmsysorg/sglang:v0.5.5.post3-cu129-amd64
    container_name: Qwen3-8B-Med
    ipc: "host"
    volumes:
      - /root/.cache/huggingface/hub:/root/.cache/huggingface/hub
      # Base model weights; referenced by --model-path below.
      - /data/models/qwen3-8B:/data/model
      # LoRA checkpoint; registered as "lora0" by --lora-paths below.
      - /data/workspace/qx/medllm/output/lora/v10-20251129-045958/checkpoint-5400:/data/lora
    restart: always
    # NOTE(review): with ipc "host" the container uses the host's /dev/shm, so
    # this size is presumably not applied -- confirm and drop one of the two.
    shm_size: '96gb'
    # Host networking: the server port below is bound directly on the host.
    network_mode: "host"
    environment:
      # NOTE(review): TRACE is a very verbose NCCL log level; presumably left
      # over from debugging -- consider a quieter level for steady-state use.
      - NCCL_DEBUG=TRACE
      - NCCL_IGNORE_DISABLED_P2P=1
    # Flow style is kept so that each flag stays on one line with its value.
    # Lines starting with '#' inside the list are disabled flags.
    command: [
      "python3", "-m", "sglang.launch_server",
      "--model-path", "/data/model",
      "--reasoning-parser", "qwen3",
      # "--context-length", "40960",
      "--served-model-name", "Qwen3-8B-Med",
      "--host", "0.0.0.0",
      # The healthcheck below must probe this same port.
      "--port", "10170",
      # tp * dp = 4, matching the four GPUs reserved under deploy.
      "--tp", "1",
      "--dp", "4",
      # NOTE(review): "deepseekv3" is a DeepSeek parser name while the model
      # is Qwen3 -- confirm against SGLang's tool-call parser list.
      "--tool-call-parser", "deepseekv3",
      # "--allow-auto-truncate",
      # "--enable-dp-attention",
      # "--enable-mixed-chunk",
      "--enable-lora",
      "--lora-paths", "lora0=/data/lora",
      "--max-loras-per-batch", "1",
      # "--disable-radix-cache",
      "--enable-deterministic-inference",
      # "--attention-backend", "torch_native",
      # "--mem-fraction-static", "0.7",
      # "--speculative-num-steps", "3",
      # "--speculative-eagle-topk", "1",
      # "--speculative-num-draft-tokens", "4",
      # "--speculative-algo", "NEXTN",
      ]
    healthcheck:
      # Probes the port given to --port above (10170). The previous URL used
      # port 30000, which this server does not listen on.
      test: ["CMD", "curl", "-f", "http://localhost:10170/health"]
      interval: 60s
      timeout: 10s
      retries: 3
      # Failures during the first 240s do not count towards the retry limit.
      start_period: 240s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Alternative (all listed GPUs):
              # '0','1','2','3','4','5','6','7'
              device_ids: ['0', '1', '2', '3']
              capabilities: [gpu]