import logging
import subprocess
from threading import Thread

import modal
import openai_harmony as oh
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from unpredictable_lord.tokenstreamer import TokenStreamer

logger = logging.getLogger(__name__)

APP_NAME = "unpredictable-lord"

VOLUME_NAME = APP_NAME + "-volume"
MOUNT_VOLUME = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
MOUNT_DIR = "/data"

# https://huggingface.co/openai/gpt-oss-20b
MODEL_IDENTIFIER = "openai/gpt-oss-20b"

# MAX_MODEL_TOKENS >= Input + Output
MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k (128 * 1024) tokens
MAX_OUTPUT_TOKENS = 512

# https://modal.com/docs/guide/gpu#specifying-gpu-type
GPU_NAME = "L4"
GPU_NUM = 1  # Number of GPUs to use
GPU = f"{GPU_NAME}:{GPU_NUM}"

# https://modal.com/pricing
# | GPU       | Memory | Price    |
# |-----------|--------|----------|
# | B200      | 180 GB | $6.25 /h |
# | H200      | 141 GB | $4.54 /h |
# | H100      | 80 GB  | $3.95 /h |
# | A100-80GB | 80 GB  | $2.50 /h |
# | A100-40GB | 40 GB  | $2.10 /h |
# | L40S      | 48 GB  | $1.95 /h |
# | A10G      | 24 GB  | $1.10 /h |
# | L4        | 24 GB  | $0.80 /h |
# | T4        | 16 GB  | $0.59 /h |

image = (
    # https://hub.docker.com/r/nvidia/cuda/tags?name=12.8
    # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04
    modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu24.04", add_python="3.12")
    .pip_install(
        [
            "accelerate>=1.12.0",
            "kernels>=0.11.1",
            "openai-harmony>=0.0.8",
            "torch>=2.9.0",
            "transformers>=4.57.1",
        ]
    )
    .env(
        {
            "HF_HOME": MOUNT_DIR + "/huggingface",
        }
    )
    .add_local_python_source("unpredictable_lord")  # Include local package
)

app = modal.App(APP_NAME, image=image)

# Global model and tokenizer (loaded once per container)
model = None
tokenizer = None
stop_token_ids = None


def load_model():
    """Load model and tokenizer into global variables."""
    global model, tokenizer, stop_token_ids

    if model is not None:
        return

    # Ensure the cache volume is the latest
    MOUNT_VOLUME.reload()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_IDENTIFIER,
        torch_dtype="auto",
        device_map="auto",
    )

    # Commit the volume so the downloaded weights are persisted in the cache
    MOUNT_VOLUME.commit()

    # Stop token IDs for Harmony assistant actions
    _encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
    stop_token_ids = _encoding.stop_tokens_for_assistant_actions()

    # Show GPU information
    subprocess.run(["nvidia-smi"])


@app.function(
    gpu=GPU,
    volumes={MOUNT_DIR: MOUNT_VOLUME},
)
def generate_stream(input_tokens):
    """
    Generate a streaming response.

    Args:
        input_tokens (list[int]): Input token IDs

    Yields:
        int: Generated token IDs
    """
    load_model()

    if len(input_tokens) + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
        raise ValueError(
            f"Input is too long: {len(input_tokens)} input tokens plus "
            f"{MAX_OUTPUT_TOKENS} output tokens exceeds MAX_MODEL_TOKENS ({MAX_MODEL_TOKENS})."
        )

    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(model.device)
    streamer = TokenStreamer()

    generation_kwargs = {
        "input_ids": input_ids,
        "max_new_tokens": MAX_OUTPUT_TOKENS,
        "eos_token_id": stop_token_ids,
        "streamer": streamer,
    }

    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    while True:
        token_id = streamer.token_queue.get()
        if token_id == streamer.stop_signal:
            break
        yield token_id

    thread.join()


@app.local_entrypoint()
def main():
    # https://cookbook.openai.com/articles/openai-harmony#harmony-renderer-library
    convo = oh.Conversation.from_messages(
        [
            oh.Message.from_role_and_content(oh.Role.SYSTEM, oh.SystemContent.new()),
            oh.Message.from_role_and_content(
                oh.Role.DEVELOPER,
                oh.DeveloperContent.new().with_instructions(
                    "Always respond in the same language as the user."
                ),
            ),
            oh.Message.from_role_and_content(
                oh.Role.USER, "Hi. How is the weather today?"
            ),
        ]
    )

    encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
    input_tokens = encoding.render_conversation_for_completion(convo, oh.Role.ASSISTANT)

    print("AI: ", end="", flush=True)
    parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
    for token in generate_stream.remote_gen(input_tokens):
        parser.process(token)
        delta = parser.last_content_delta
        if delta:
            print(delta, end="", flush=True)
    print()
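

# ---------------------------------------------------------------------------
# For reference only: a minimal sketch of the streamer contract that
# generate_stream() relies on. The real TokenStreamer is imported from
# unpredictable_lord/tokenstreamer.py and may differ in detail; the class
# below (ReferenceTokenStreamer) is a hypothetical illustration and is not
# used anywhere in this app. transformers' generate() calls put() first with
# the prompt tensor, then with each newly generated token tensor, and calls
# end() once generation finishes.
# ---------------------------------------------------------------------------
from queue import Queue

from transformers.generation.streamers import BaseStreamer


class ReferenceTokenStreamer(BaseStreamer):
    """Illustrative sketch only; the app uses unpredictable_lord.tokenstreamer.TokenStreamer."""

    def __init__(self):
        self.token_queue = Queue()
        self.stop_signal = None  # sentinel pushed into the queue by end()
        self._prompt_consumed = False

    def put(self, value):
        # The first call carries the prompt tokens; skip them so only newly
        # generated tokens reach the consumer loop.
        if not self._prompt_consumed:
            self._prompt_consumed = True
            return
        for token_id in value.reshape(-1).tolist():
            self.token_queue.put(token_id)

    def end(self):
        # Unblock the consumer: generate_stream() breaks when it sees stop_signal.
        self.token_queue.put(self.stop_signal)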