#FROM nvcr.io/nvidia/tritonserver:25.05-pyt-python-py3
FROM nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3

# ──────────────────────── SYSTEM SETUP ────────────────────────
USER root
RUN apt-get update -y && \
    apt-get install -y git python3-pip && \
    pip install gradio 'tritonclient[http]' && \
    pip install -U --no-cache-dir vllm --extra-index-url https://download.pytorch.org/whl/cu128 && \
    pip install flash-attn bitsandbytes
    #pip install flash-attn flashinfer-python bitsandbytes

# ──────────────────────── HF / GIT IDENTITY ────────────────────────
RUN install -d -m 755 -o triton-server -g triton-server /home/triton-server && \
    touch /home/triton-server/.gitconfig && \
    chown triton-server:triton-server /home/triton-server/.gitconfig
USER triton-server
RUN git config --global user.email "OscarGD6@users.noreply.huggingface.co" && \
    git config --global user.name "OscarGD6"

# ──────────────────────── HF CACHE MOUNT ────────────────────────
USER root
RUN mkdir -p /opt/triton_cache/hf && \
    chown -R triton-server:triton-server /opt/triton_cache
ENV HF_HOME=/opt/triton_cache/hf
# TRANSFORMERS_CACHE is deprecated in recent transformers releases; kept here for older versions.
ENV TRANSFORMERS_CACHE=/opt/triton_cache/hf
# Disable vLLM usage-stats reporting.
ENV VLLM_NO_USAGE_STATS=1

# ──────────────────────── APP FILES ────────────────────────
WORKDIR /workspace
COPY model_repository ./model_repository
COPY launch.sh ./launch.sh
COPY app.py ./app.py
RUN chmod +x ./launch.sh
USER triton-server

# ──────────────────────── PORTS ────────────────────────
# 8000/8001/8002 = Triton HTTP, gRPC, and metrics; 7860 = Gradio UI.
EXPOSE 8000 8001 8002 8008 7860

# ──────────────────────── STARTUP ────────────────────────
CMD ["./launch.sh"]
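
# ──────────────────────── NOTES (launch.sh sketch) ────────────────────────
# launch.sh is not included in this file. As a minimal sketch (an assumption, not the
# actual script), it is expected to start Triton against the copied model_repository
# and then the Gradio front end in app.py, matching the ports exposed above:
#
#   #!/usr/bin/env bash
#   set -euo pipefail
#   # Serve the bundled model repository on Triton's default ports (8000/8001/8002).
#   tritonserver --model-repository=/workspace/model_repository &
#   # Start the Gradio UI (app.py) on 7860; it talks to Triton via tritonclient[http].
#   python3 /workspace/app.py
#   wait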