```bash
VLLM_ALLOW_RUNTIME_LORA_UPDATING=1 VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 vllm serve ~/models/gemma2-2b \
    --gpu-memory-utilization=1 \
    --port 6002 \
    --served-model-name="gemma" \
    --trust-remote-code \
    --max-model-len 8192 \
    --disable-log-requests \
    --enable-lora \
    --lora-modules gpqa=./
    # optionally: --guided-decoding-backend lm-format-enforcer
```
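
Once the server is up, the base model is reachable under the served name `gemma` and the LoRA adapter under `gpqa` via the OpenAI-compatible API on port 6002. A minimal sketch of querying the adapter, and of hot-swapping another adapter at runtime (enabled by `VLLM_ALLOW_RUNTIME_LORA_UPDATING=1`); the prompt text and the `my_adapter` name/path are illustrative placeholders, not part of the setup above:

```bash
# Query the LoRA adapter registered as "gpqa" at startup.
curl http://localhost:6002/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "gpqa", "prompt": "Question: ...", "max_tokens": 64}'

# Load an additional adapter at runtime (hypothetical name and path).
curl http://localhost:6002/v1/load_lora_adapter \
    -H "Content-Type: application/json" \
    -d '{"lora_name": "my_adapter", "lora_path": "/path/to/adapter"}'
```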