#FROM nvcr.io/nvidia/tritonserver:25.05-pyt-python-py3
FROM nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3

# ──────────────────────── SYSTEM SETUP ────────────────────────
USER root
RUN apt-get update -y && \
    apt-get install -y git python3-pip && \
    pip install gradio 'tritonclient[http]' && \
    pip install -U --no-cache-dir vllm --extra-index-url https://download.pytorch.org/whl/cu128 && \
    pip install flash-attn bitsandbytes
    #pip install flash-attn flashinfer-python bitsandbytes

# ──────────────────────── HF / GIT IDENTITY ────────────────────────
RUN install -d -m 755 -o triton-server -g triton-server /home/triton-server && \
    touch /home/triton-server/.gitconfig && \
    chown triton-server:triton-server /home/triton-server/.gitconfig
USER triton-server
RUN git config --global user.email "OscarGD6@users.noreply.huggingface.co" && \
    git config --global user.name "OscarGD6"

# ──────────────────────── HF CACHE MOUNT ────────────────────────
USER root
RUN mkdir -p /opt/triton_cache/hf && \
    chown -R triton-server:triton-server /opt/triton_cache
ENV HF_HOME=/opt/triton_cache/hf
# TRANSFORMERS_CACHE is deprecated in recent transformers releases; kept here for older versions.
ENV TRANSFORMERS_CACHE=/opt/triton_cache/hf
# Disable vLLM usage-stats reporting.
ENV VLLM_NO_USAGE_STATS=1

# ──────────────────────── APP FILES ────────────────────────
WORKDIR /workspace
COPY model_repository ./model_repository
COPY launch.sh ./launch.sh
COPY app.py ./app.py
RUN chmod +x ./launch.sh
USER triton-server

# ──────────────────────── PORTS ────────────────────────
# 8000/8001/8002 = Triton HTTP, gRPC, and metrics; 7860 = Gradio UI.
EXPOSE 8000 8001 8002 8008 7860

# ──────────────────────── STARTUP ────────────────────────
CMD ["./launch.sh"]
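
# ──────────────────────── NOTES (launch.sh sketch) ────────────────────────
# launch.sh is not included in this file. As a minimal sketch (an assumption, not the
# actual script), it is expected to start Triton against the copied model_repository
# and then the Gradio front end in app.py, matching the ports exposed above:
#
#   #!/usr/bin/env bash
#   set -euo pipefail
#   # Serve the bundled model repository on Triton's default ports (8000/8001/8002).
#   tritonserver --model-repository=/workspace/model_repository &
#   # Start the Gradio UI (app.py) on 7860; it talks to Triton via tritonclient[http].
#   python3 /workspace/app.py
#   wait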