Spaces · build status: Build error

Commit 1efb585 · Promise Emmanuel Oluwadare committed
Parent(s): 266954d

Add Qwen3.5-0.8B OpenAI-compatible server

Files changed:
- .dockerignore +11 -0
- Dockerfile +32 -0
- README.md +58 -4
- requirements.txt +2 -0
- start_server.py +61 -0
.dockerignore
ADDED · @@ -0,0 +1,11 @@

```
.git
.gitignore
__pycache__
*.pyc
*.pyo
*.pyd
.Python
venv
.venv
dist
build
```
Dockerfile
ADDED · @@ -0,0 +1,32 @@

```dockerfile
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    pkg-config \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY requirements.txt /app/requirements.txt
RUN pip install --upgrade pip && pip install -r /app/requirements.txt

COPY start_server.py /app/start_server.py

ENV PORT=7860 \
    MODEL_REPO=unsloth/Qwen3.5-0.8B-GGUF \
    MODEL_FILE=Qwen3.5-0.8B-Q4_K_M.gguf \
    MODEL_DIR=/tmp/models \
    N_CTX=4096 \
    N_THREADS=4 \
    CHAT_FORMAT=chatml

EXPOSE 7860

CMD ["python", "/app/start_server.py"]
```
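For a local smoke test before pushing to the Space, you can build and run the image with Docker (for example, `docker build -t qwen-space . && docker run -p 7860:7860 qwen-space`) and probe the `/v1/models` endpoint. The snippet below is a minimal sketch of such a check, assuming the container is already running and mapped to `localhost:7860`; it uses only the Python standard library:

```python
# Minimal health check: list the models served by the locally running container.
# Assumes the image was started with: docker run -p 7860:7860 qwen-space
import json
import urllib.request

with urllib.request.urlopen("http://localhost:7860/v1/models", timeout=30) as resp:
    payload = json.load(resp)

# An OpenAI-compatible server returns {"data": [{"id": ...}, ...]}.
for model in payload.get("data", []):
    print(model["id"])
```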
README.md
CHANGED · @@ -1,10 +1,64 @@

```diff
 ---
-title:
-emoji:
-colorFrom:
+title: Qwen3.5 0.8B OpenAI API
+emoji: "🧠"
+colorFrom: amber
 colorTo: gray
 sdk: docker
+app_port: 7860
 pinned: false
 ---
 
-
+# Hugging Face Space Template (OpenAI-Compatible Qwen 0.8B)
+
+This folder is ready to use as a Docker Space that serves `Qwen3.5-0.8B` behind OpenAI-style endpoints:
+
+- `GET /v1/models`
+- `POST /v1/chat/completions` (with streaming)
+
+## 1) Create the Space
+
+1. Go to Hugging Face -> **New Space**.
+2. Select the **Docker** SDK.
+3. Choose hardware:
+   - For free testing: **CPU Basic**.
+4. Create the Space.
+
+## 2) Upload these files
+
+Upload all files from this folder to the root of the Space repository:
+
+- `Dockerfile`
+- `requirements.txt`
+- `start_server.py`
+- `.dockerignore`
+- `README.md` (this file)
+
+## 3) Set Space variables (Settings -> Variables and secrets)
+
+Recommended defaults:
+
+- `MODEL_REPO=unsloth/Qwen3.5-0.8B-GGUF`
+- `MODEL_FILE=Qwen3.5-0.8B-Q4_K_M.gguf`
+- `N_CTX=4096`
+- `N_THREADS=4`
+- `CHAT_FORMAT=chatml`
+
+Optional:
+
+- `API_KEY=<your-secret>` to require bearer auth.
+- `HF_TOKEN=<token>` if the model repo is private.
+
+## 4) Connect the frontend
+
+In this app's Settings:
+
+- Preset: `Hugging Face Space`
+- Base URL: `https://<your-space-name>.hf.space/v1`
+- Model Name: `Qwen3.5-0.8B-Q4_K_M.gguf`
+- API Key: only if you set `API_KEY` in the Space
+
+## Notes
+
+- Free CPU Spaces can sleep when idle and cold-start slowly.
+- The first boot includes the model download, so startup may take a few minutes.
+- If you hit memory pressure, switch to a smaller GGUF quantization file.
```
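Because the server speaks the OpenAI protocol, any OpenAI-compatible client can talk to it. Below is a minimal sketch of a streaming chat request using the `openai` Python SDK (a client-side dependency, not part of this repo's `requirements.txt`); replace `<your-space-name>` with the Space's actual subdomain:

```python
# Streaming chat completion against the Space via the OpenAI Python SDK (v1+).
# <your-space-name> is a placeholder for the real Space subdomain.
from openai import OpenAI

client = OpenAI(
    base_url="https://<your-space-name>.hf.space/v1",
    api_key="not-needed",  # use the API_KEY value instead if you set one
)

stream = client.chat.completions.create(
    model="Qwen3.5-0.8B-Q4_K_M.gguf",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    stream=True,
)

# Print tokens as they arrive; delta.content is None on some chunks.
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```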
requirements.txt
ADDED · @@ -0,0 +1,2 @@

```
llama-cpp-python[server]>=0.2.90
huggingface_hub>=0.25.0
```
start_server.py
ADDED · @@ -0,0 +1,61 @@

```python
import os
import subprocess
import sys
from pathlib import Path

from huggingface_hub import hf_hub_download


def read_env(name: str, default: str) -> str:
    value = os.getenv(name, default).strip()
    return value or default


def main() -> None:
    repo_id = read_env("MODEL_REPO", "unsloth/Qwen3.5-0.8B-GGUF")
    filename = read_env("MODEL_FILE", "Qwen3.5-0.8B-Q4_K_M.gguf")
    model_dir = Path(read_env("MODEL_DIR", "/tmp/models"))
    port = read_env("PORT", "7860")
    n_ctx = read_env("N_CTX", "4096")
    n_threads = read_env("N_THREADS", "4")
    chat_format = read_env("CHAT_FORMAT", "chatml")
    api_key = os.getenv("API_KEY", "").strip()

    model_dir.mkdir(parents=True, exist_ok=True)

    # Download the GGUF weights from the Hub; a token is only needed for
    # private model repos.
    token = os.getenv("HF_TOKEN", "").strip() or os.getenv("HUGGING_FACE_HUB_TOKEN", "").strip() or None
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        token=token,
        local_dir=str(model_dir),
    )

    # Launch the OpenAI-compatible llama-cpp-python server in the foreground
    # so the container stays alive for as long as the server runs.
    command = [
        sys.executable,
        "-m",
        "llama_cpp.server",
        "--model",
        model_path,
        "--host",
        "0.0.0.0",
        "--port",
        port,
        "--n_ctx",
        n_ctx,
        "--n_threads",
        n_threads,
        "--chat_format",
        chat_format,
    ]

    if api_key:
        command.extend(["--api_key", api_key])

    print("Starting OpenAI-compatible model server:")
    print(" ".join(command))
    subprocess.run(command, check=True)


if __name__ == "__main__":
    main()
```
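One detail worth noting: `read_env` falls back to the default not only when a variable is unset but also when it is set to an empty or whitespace-only string, which is easy to do accidentally in the Space settings UI. A small self-contained illustration of that behavior:

```python
import os

def read_env(name: str, default: str) -> str:
    # Same helper as in start_server.py above.
    value = os.getenv(name, default).strip()
    return value or default

os.environ["N_CTX"] = "   "                  # blank value saved in the Space UI
assert read_env("N_CTX", "4096") == "4096"   # whitespace-only falls back to the default

os.environ["N_CTX"] = "8192"
assert read_env("N_CTX", "4096") == "8192"   # explicit values win
```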