ryomo committed
Commit 2129942 · 1 Parent(s): bd7f723

feat: implement LLM chat functionality with Modal

README.md CHANGED
@@ -9,7 +9,7 @@ python_version: 3.10
  app_file: app.py
  pinned: false
  license: mit
- short_description: A strategy game where the player manages a realm by advising an AI Lord.
+ short_description: A strategy game. Advise, but don't command, the AI Lord.
  tags:
  - mcp-in-action-track-creative
  ---
@@ -24,6 +24,14 @@ Check out the configuration reference at <https://huggingface.co/docs/hub/spaces
  uv sync --frozen
  ```

+ ### Modal
+
+ If your local GPU is not powerful enough to run the model, you can deploy it to Modal 🚀
+
+ ```sh
+ uv run modal deploy src/unpredictable_lord/modal_main.py
+ ```
+
  ### Run locally

  ```sh
app.py CHANGED
@@ -1,16 +1,49 @@
+ import os
+ import sys
+
+ # Add src directory to Python path for Hugging Face Spaces compatibility
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
+
  import gradio as gr
- import spaces
- import torch

- zero = torch.Tensor([0]).cuda()
- print(zero.device) # <-- 'cpu' 🤔
+ from unpredictable_lord.chat import chat_with_llm_stream
+
+ print("ZERO_GPU:", os.environ.get("ZERO_GPU"))
+
+ # Gradio UI
+ with gr.Blocks(title="Unpredictable Lord") as demo:
+     gr.Markdown("# Unpredictable Lord\nLord Advisor AI Simulation")
+
+     chatbot = gr.Chatbot(label="Lord AI", height=600, type="messages")
+
+     with gr.Row():
+         msg = gr.Textbox(
+             label="Your Advice", placeholder="My Lord, I have a proposal...", scale=4
+         )
+         submit_btn = gr.Button("Submit", scale=1)
+
+     clear = gr.Button("Clear History")
+
+     def user(user_message, history):
+         # Append user message to history in messages format
+         return "", history + [{"role": "user", "content": user_message}]
+
+     def bot(history):
+         # The last message is the user's message
+         user_message = history[-1]["content"]
+         history_for_model = history[:-1]

+         for updated_history in chat_with_llm_stream(user_message, history_for_model):
+             yield updated_history

- @spaces.GPU
- def greet(n):
-     print(zero.device) # <-- 'cuda:0' 🤗
-     return f"Hello {zero + n} Tensor"
+     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     submit_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)


- demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
- demo.launch()
+ if __name__ == "__main__":
+     demo.launch()
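The `user`/`bot` handler pair above relies on Gradio's "messages" history format (`gr.Chatbot(type="messages")`). A minimal sketch of how that history is threaded through the two handlers; the message text is illustrative, and `user` is copied from the diff above:

```python
# Sketch of the "messages"-format history used by gr.Chatbot(type="messages").

def user(user_message, history):
    # Same handler as in app.py: append the user turn and clear the textbox.
    return "", history + [{"role": "user", "content": user_message}]

history: list[dict[str, str]] = []
_, history = user("My Lord, I have a proposal...", history)

# bot() then splits the history: the last entry is the new user turn,
# and everything before it is passed to chat_with_llm_stream() as prior context.
user_message = history[-1]["content"]
history_for_model = history[:-1]

print(user_message)       # -> "My Lord, I have a proposal..."
print(history_for_model)  # -> []
```

The `queue=False` on the `user` step keeps that fast, UI-only update out of the queue, while the chained `.then(bot, ...)` step streams the model reply into the chatbot.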
src/unpredictable_lord/__init__.py CHANGED
@@ -1,2 +0,0 @@
- def main() -> None:
-     print("Hello from unpredictable-lord!")
src/unpredictable_lord/chat.py ADDED
@@ -0,0 +1,104 @@
+ """
+ AI chat functionality implementation
+
+ Provides chat functionality by calling the LLM endpoint on Modal.
+ """
+
+ import modal
+ import openai_harmony as oh
+
+ APP_NAME = "unpredictable-lord"
+ LLMModel = modal.Cls.from_name(APP_NAME, "LLMModel")
+ model = LLMModel()
+
+
+ def chat_with_llm_stream(
+     user_message: str,
+     chat_history: list[dict[str, str]],
+ ):
+     """
+     Chat with LLM (streaming version)
+
+     Args:
+         user_message: User's message
+         chat_history: Past chat history (list of dictionaries in Gradio format)
+             [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
+
+     Yields:
+         updated_chat_history: Updated chat history (Gradio format)
+     """
+     try:
+         # 1. Build message list for LLM request (openai-harmony format)
+         messages = []
+
+         # System message
+         system_content = (
+             oh.SystemContent.new()
+             .with_model_identity(
+                 "You are a lord of a medieval fantasy kingdom. The user is your advisor. "
+                 "Listen to your advisor's advice and act for the development of your territory and the maintenance of your authority. "
+                 "Speak in an arrogant tone."
+             )
+             .with_reasoning_effort(oh.ReasoningEffort.LOW)
+             # .with_conversation_start_date("2025-11-21")
+         )
+         messages.append(
+             oh.Message.from_role_and_content(oh.Role.SYSTEM, system_content)
+         )
+
+         # Convert past history to openai-harmony format and add
+         for msg in chat_history:
+             if msg["role"] == "user":
+                 messages.append(
+                     oh.Message.from_role_and_content(oh.Role.USER, msg["content"])
+                 )
+             elif msg["role"] == "assistant":
+                 messages.append(
+                     oh.Message.from_role_and_content(oh.Role.ASSISTANT, msg["content"])
+                 )
+
+         # Add current user message
+         messages.append(oh.Message.from_role_and_content(oh.Role.USER, user_message))
+
+         # Encode message
+         convo = oh.Conversation.from_messages(messages)
+         encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+         input_tokens = encoding.render_conversation_for_completion(
+             convo, oh.Role.ASSISTANT
+         )
+
+         parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
+
+         # 2. Build history list for UI display (Gradio format)
+         # Add user message and empty assistant message to be generated
+         partial_history = chat_history + [
+             {"role": "user", "content": user_message},
+             {"role": "assistant", "content": ""},
+         ]
+
+         # Streaming generation
+         generator = model.generate_stream.remote_gen(input_tokens)
+
+         response_text = ""
+         for token in generator:
+             if token is None:
+                 continue
+             parser.process(token)
+
+             # Get content only from the final channel
+             if parser.current_channel == "final":
+                 delta = parser.last_content_delta
+                 if delta:
+                     response_text += delta
+                     # Update history and yield
+                     partial_history[-1]["content"] = response_text
+                     yield partial_history
+
+     except Exception as e:
+         error_message = f"An error occurred: {str(e)}"
+         print(error_message)
+         updated_history = chat_history + [
+             {"role": "user", "content": user_message},
+             {"role": "assistant", "content": error_message},
+         ]
+         yield updated_history
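Since `chat_with_llm_stream` is a plain generator, it can also be exercised outside Gradio. A minimal sketch, assuming the Modal app `unpredictable-lord` from `modal_main.py` is already deployed and the package is importable (e.g. `src` on the Python path); the advisor question is illustrative:

```python
# Drive chat_with_llm_stream() directly and print the streamed reply.
# Assumes `uv run modal deploy src/unpredictable_lord/modal_main.py` has been run.
from unpredictable_lord.chat import chat_with_llm_stream

history: list[dict[str, str]] = []

for updated in chat_with_llm_stream("My Lord, should we raise taxes?", history):
    history = updated  # each yield is the full Gradio-format history so far

# The last entry holds the fully streamed assistant reply (or an error message).
print(history[-1]["content"])
```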
src/unpredictable_lord/modal_main.py ADDED
@@ -0,0 +1,203 @@
+ import subprocess
+ from queue import Queue
+ from threading import Thread
+
+ import modal
+ import openai_harmony as oh
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.generation.streamers import BaseStreamer
+
+ APP_NAME = "unpredictable-lord"
+ VOLUME_NAME = APP_NAME + "-volume"
+ MOUNT_VOLUME = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
+ MOUNT_DIR = "/data"
+
+ # https://huggingface.co/openai/gpt-oss-20b
+ MODEL_IDENTIFIER = "openai/gpt-oss-20b"
+ # https://huggingface.co/openai/gpt-oss-120b
+ # MODEL_IDENTIFIER = "openai/gpt-oss-120b"
+
+ # https://modal.com/docs/guide/gpu#specifying-gpu-type
+ GPU_NAME = "L4"
+ GPU_NUM = 1  # Number of GPUs to use
+ GPU = f"{GPU_NAME}:{GPU_NUM}"
+
+ # https://modal.com/pricing
+ # | GPU       | Memory | Price    |
+ # |-----------|--------|----------|
+ # | B200      | 180 GB | $6.25 /h |
+ # | H200      | 141 GB | $4.54 /h |
+ # | H100      | 80 GB  | $3.95 /h |
+ # | A100-80GB | 80 GB  | $2.50 /h |
+ # | A100-40GB | 40 GB  | $2.10 /h |
+ # | L40S      | 48 GB  | $1.95 /h |
+ # | A10G      | 24 GB  | $1.10 /h |
+ # | L4        | 24 GB  | $0.80 /h |
+ # | T4        | 16 GB  | $0.59 /h |
+
+ # MAX_MODEL_TOKENS >= Input + Output
+ MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k (128 * 1024) tokens
+ MAX_OUTPUT_TOKENS = 512
+
+ image = (
+     # https://hub.docker.com/r/nvidia/cuda/tags?name=12.8
+     # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04
+     modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu24.04", add_python="3.12")
+     .pip_install(
+         [
+             "accelerate>=1.12.0",
+             "kernels>=0.11.1",
+             "openai-harmony>=0.0.8",
+             "torch>=2.9.0",
+             "transformers>=4.57.1",
+         ]
+     )
+     .env(
+         {
+             "HF_HOME": MOUNT_DIR + "/huggingface",
+         }
+     )
+ )
+
+ app = modal.App(APP_NAME, image=image)
+
+ # NOTE: `@app.cls`, `@modal.enter()`, and `@modal.method()` are used like `@app.function()`
+ # https://modal.com/docs/guide/lifecycle-functions
+
+
+ class TokenStreamer(BaseStreamer):
+     """
+     Streamer that queues token IDs directly.
+
+     NOTE: transformers' TextStreamer returns decoded text, but
+     OpenAI Harmony's parser requires token IDs, so it cannot be used.
+     Therefore, I implemented a custom streamer that queues token IDs directly.
+     """
+
+     def __init__(self, timeout=None):
+         self.token_queue = Queue()
+         self.stop_signal = None
+         self.timeout = timeout
+
+     def put(self, value):
+         if value.ndim > 1:
+             value = value.flatten()
+         for token in value:
+             self.token_queue.put(token.item())
+
+     def end(self):
+         self.token_queue.put(self.stop_signal)
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         value = self.token_queue.get(timeout=self.timeout)
+         if value == self.stop_signal:
+             raise StopIteration()
+         return value
+
+
+ @app.cls(
+     gpu=GPU,
+     image=image,
+     volumes={MOUNT_DIR: MOUNT_VOLUME},
+     # secrets=[modal.Secret.from_name("huggingface-secret")],
+     # scaledown_window=15 * 60,
+     # timeout=30 * 60,
+ )
+ class LLMModel:
+     @modal.enter()
+     def setup(self):
+         # Ensure the cache volume is the latest
+         MOUNT_VOLUME.reload()
+
+         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
+         self.model = AutoModelForCausalLM.from_pretrained(
+             MODEL_IDENTIFIER,
+             dtype="auto",
+             device_map="auto",
+         )
+
+         # Commit the volume to ensure the model is saved
+         MOUNT_VOLUME.commit()
+
+         self.encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+         self.stop_token_ids = self.encoding.stop_tokens_for_assistant_actions()
+
+         # Show GPU information
+         subprocess.run(["nvidia-smi"])
+
+     @modal.method()
+     def generate_stream(self, input_tokens, _=None):
+         """
+         Generate a streaming response
+
+         Args:
+             input_tokens (list[int]): Input token IDs
+             _: Dummy parameter for compatibility
+         """
+
+         if len(input_tokens) + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
+             raise ValueError(
+                 f"Input length exceeds the maximum allowed tokens: {MAX_MODEL_TOKENS}. "
+                 f"Current input length: {len(input_tokens)} tokens."
+             )
+
+         input_ids = torch.tensor([input_tokens], dtype=torch.long).to(self.model.device)
+
+         streamer = TokenStreamer()
+         generation_kwargs = {
+             "input_ids": input_ids,
+             "max_new_tokens": MAX_OUTPUT_TOKENS,
+             "eos_token_id": self.stop_token_ids,
+             "streamer": streamer,
+         }
+
+         # Start generation in a separate thread
+         thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+         thread.start()
+
+         while True:
+             token_id = streamer.token_queue.get()
+             if token_id == streamer.stop_signal:
+                 break
+             yield token_id
+
+         thread.join()
+
+
+ @app.local_entrypoint()
+ def main():
+     # https://cookbook.openai.com/articles/openai-harmony#harmony-renderer-library
+     convo = oh.Conversation.from_messages(
+         [
+             oh.Message.from_role_and_content(oh.Role.SYSTEM, oh.SystemContent.new()),
+             oh.Message.from_role_and_content(
+                 oh.Role.DEVELOPER,
+                 oh.DeveloperContent.new().with_instructions(
+                     "Always respond in the same language as the user."
+                 ),
+             ),
+             oh.Message.from_role_and_content(
+                 oh.Role.USER, "Hi. How is the weather today?"
+             ),
+         ]
+     )
+
+     model = LLMModel()
+
+     encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+     input_tokens = encoding.render_conversation_for_completion(convo, oh.Role.ASSISTANT)
+
+     print("AI: ", end="", flush=True)
+
+     parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
+
+     for token in model.generate_stream.remote_gen(input_tokens):
+         parser.process(token)
+         delta = parser.last_content_delta
+         if delta:
+             print(delta, end="", flush=True)
+     print()
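The `TokenStreamer` note above (queue raw token IDs instead of decoded text) can be tried locally without Modal or a GPU. A minimal sketch under stated assumptions: `gpt2` is only an illustrative, CPU-friendly stand-in and not the model used by this Space, and the Harmony parsing step is omitted here:

```python
from queue import Queue
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import BaseStreamer


class TokenStreamer(BaseStreamer):
    """Queue raw token IDs instead of decoded text (same idea as in modal_main.py)."""

    def __init__(self):
        self.token_queue = Queue()
        self.stop_signal = None

    def put(self, value):
        # generate() passes tensors of token IDs (the prompt first, then new tokens).
        if value.ndim > 1:
            value = value.flatten()
        for token in value:
            self.token_queue.put(token.item())

    def end(self):
        self.token_queue.put(self.stop_signal)


# "gpt2" is an illustrative stand-in model for a local smoke test.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, my lord.", return_tensors="pt")
streamer = TokenStreamer()

# Run generation in a background thread and consume token IDs as they arrive,
# mirroring generate_stream() above.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "max_new_tokens": 20, "streamer": streamer},
)
thread.start()

while True:
    token_id = streamer.token_queue.get()
    if token_id == streamer.stop_signal:
        break
    # In the Space, these IDs are fed to a Harmony StreamableParser;
    # here they are just decoded for display (prompt tokens are echoed first).
    print(tokenizer.decode([token_id]), end="", flush=True)

thread.join()
print()
```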