import logging
import subprocess
from threading import Thread
import modal
import openai_harmony as oh
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from unpredictable_lord.tokenstreamer import TokenStreamer
logger = logging.getLogger(__name__)
APP_NAME = "unpredictable-lord"
VOLUME_NAME = APP_NAME + "-volume"
MOUNT_VOLUME = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
MOUNT_DIR = "/data"
# https://huggingface.co/openai/gpt-oss-20b
MODEL_IDENTIFIER = "openai/gpt-oss-20b"
# Input tokens + output tokens must fit within MAX_MODEL_TOKENS
MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k (128 * 1024) tokens
MAX_OUTPUT_TOKENS = 512
# https://modal.com/docs/guide/gpu#specifying-gpu-type
GPU_NAME = "L4"
GPU_NUM = 1 # Number of GPUs to use
GPU = f"{GPU_NAME}:{GPU_NUM}"
# https://modal.com/pricing
# | GPU | Memory | Price |
# |-----------|--------|----------|
# | B200 | 180 GB | $6.25 /h |
# | H200 | 141 GB | $4.54 /h |
# | H100 | 80 GB | $3.95 /h |
# | A100-80GB | 80 GB | $2.50 /h |
# | A100-40GB | 40 GB | $2.10 /h |
# | L40S | 48 GB | $1.95 /h |
# | A10G | 24 GB | $1.10 /h |
# | L4 | 24 GB | $0.80 /h |
# | T4 | 16 GB | $0.59 /h |
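# L4 (24 GB) is used here; per the gpt-oss-20b model card, the model is intended to run
# within roughly 16 GB of memory thanks to its MXFP4-quantized weights.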
image = (
# https://hub.docker.com/r/nvidia/cuda/tags?name=12.8
# https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04
modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu24.04", add_python="3.12")
.pip_install(
[
"accelerate>=1.12.0",
"kernels>=0.11.1",
"openai-harmony>=0.0.8",
"torch>=2.9.0",
"transformers>=4.57.1",
]
)
.env(
{
"HF_HOME": MOUNT_DIR + "/huggingface",
}
)
.add_local_python_source("unpredictable_lord") # Include local package
)
app = modal.App(APP_NAME, image=image)
# Global model, tokenizer, and stop tokens (loaded once per container)
model = None
tokenizer = None
stop_token_ids = None
def load_model():
"""Load model and tokenizer into global variables."""
global model, tokenizer, stop_token_ids
if model is not None:
return
# Ensure the cache volume is the latest
MOUNT_VOLUME.reload()
tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
model = AutoModelForCausalLM.from_pretrained(
MODEL_IDENTIFIER,
        dtype="auto",  # "dtype" replaces the deprecated "torch_dtype" argument in recent transformers releases
device_map="auto",
)
    # Commit the volume so the downloaded weights persist in the HF cache
MOUNT_VOLUME.commit()
# Load stop token IDs
_encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
stop_token_ids = _encoding.stop_tokens_for_assistant_actions()
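    # These Harmony stop tokens (typically <|return|> and <|call|>) mark the end of an
    # assistant turn; they are passed to generate() as eos_token_id in generate_stream().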
# Show GPU information
subprocess.run(["nvidia-smi"])
@app.function(
gpu=GPU,
volumes={MOUNT_DIR: MOUNT_VOLUME},
)
def generate_stream(input_tokens):
"""
Generate a streaming response
Args:
input_tokens (list[int]): Input token IDs
Yields:
int: Generated token IDs
"""
load_model()
    if len(input_tokens) + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
        raise ValueError(
            f"Input plus output budget exceeds the model limit of {MAX_MODEL_TOKENS} tokens. "
            f"Current input length: {len(input_tokens)} tokens; "
            f"reserved output: {MAX_OUTPUT_TOKENS} tokens."
        )
input_ids = torch.tensor([input_tokens], dtype=torch.long).to(model.device)
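    # TokenStreamer (from the local unpredictable_lord package) is assumed to behave like a
    # transformers streamer: it queues generated token IDs on token_queue and pushes
    # stop_signal once generation finishes.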
streamer = TokenStreamer()
generation_kwargs = {
"input_ids": input_ids,
"max_new_tokens": MAX_OUTPUT_TOKENS,
"eos_token_id": stop_token_ids,
"streamer": streamer,
}
# Start generation in a separate thread
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
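    # Drain token IDs from the streamer queue and yield them until the stop signal arrives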
while True:
token_id = streamer.token_queue.get()
if token_id == streamer.stop_signal:
break
yield token_id
thread.join()
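# Local entrypoint executed by `modal run`; it renders a sample Harmony conversation,
# streams tokens from the remote GPU function, and prints the assistant's reply.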
@app.local_entrypoint()
def main():
# https://cookbook.openai.com/articles/openai-harmony#harmony-renderer-library
convo = oh.Conversation.from_messages(
[
oh.Message.from_role_and_content(oh.Role.SYSTEM, oh.SystemContent.new()),
oh.Message.from_role_and_content(
oh.Role.DEVELOPER,
oh.DeveloperContent.new().with_instructions(
"Always respond in the same language as the user."
),
),
oh.Message.from_role_and_content(
oh.Role.USER, "Hi. How is the weather today?"
),
]
)
encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
input_tokens = encoding.render_conversation_for_completion(convo, oh.Role.ASSISTANT)
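    # The rendered tokens end with the assistant header, so the model's completion
    # continues as the assistant turn; StreamableParser decodes it incrementally below.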
print("AI: ", end="", flush=True)
parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
for token in generate_stream.remote_gen(input_tokens):
parser.process(token)
delta = parser.last_content_delta
if delta:
print(delta, end="", flush=True)
print()