ryomo committed
Commit 2129942 · 1 Parent(s): bd7f723

feat: implement LLM chat functionality with Modal

README.md CHANGED
@@ -9,7 +9,7 @@ python_version: 3.10
  app_file: app.py
  pinned: false
  license: mit
- short_description: A strategy game where the player manages a realm by advising an AI Lord.
+ short_description: A strategy game. Advise, but don't command, the AI Lord.
  tags:
  - mcp-in-action-track-creative
  ---
@@ -24,6 +24,14 @@ Check out the configuration reference at <https://huggingface.co/docs/hub/spaces
  uv sync --frozen
  ```

+ ### Modal
+
+ If your local GPU is not powerful enough to run the model, you can deploy it to Modal 🚀
+
+ ```sh
+ uv run modal deploy src/unpredictable_lord/modal_main.py
+ ```
+
  ### Run locally

  ```sh
app.py CHANGED
@@ -1,16 +1,49 @@
+ import os
+ import sys
+
+ # Add src directory to Python path for Hugging Face Spaces compatibility
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
+
  import gradio as gr
- import spaces
- import torch

- zero = torch.Tensor([0]).cuda()
- print(zero.device) # <-- 'cpu' 🤔
+ from unpredictable_lord.chat import chat_with_llm_stream
+
+ print("ZERO_GPU:", os.environ.get("ZERO_GPU"))
+
+ # Gradio UI
+ with gr.Blocks(title="Unpredictable Lord") as demo:
+     gr.Markdown("# Unpredictable Lord\nLord Advisor AI Simulation")
+
+     chatbot = gr.Chatbot(label="Lord AI", height=600, type="messages")
+
+     with gr.Row():
+         msg = gr.Textbox(
+             label="Your Advice", placeholder="My Lord, I have a proposal...", scale=4
+         )
+         submit_btn = gr.Button("Submit", scale=1)
+
+     clear = gr.Button("Clear History")
+
+     def user(user_message, history):
+         # Append user message to history in messages format
+         return "", history + [{"role": "user", "content": user_message}]
+
+     def bot(history):
+         # The last message is the user's message
+         user_message = history[-1]["content"]
+         history_for_model = history[:-1]

+         for updated_history in chat_with_llm_stream(user_message, history_for_model):
+             yield updated_history

- @spaces.GPU
- def greet(n):
-     print(zero.device) # <-- 'cuda:0' 🤗
-     return f"Hello {zero + n} Tensor"
+     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     submit_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)


- demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
- demo.launch()
+ if __name__ == "__main__":
+     demo.launch()
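The `user`/`bot` handler pair above relies on Gradio's "messages" history format (`gr.Chatbot(type="messages")`). A minimal sketch of how that history is threaded through the two handlers; the message text is illustrative, and `user` is copied from the diff above:

```python
# Sketch of the "messages"-format history used by gr.Chatbot(type="messages").

def user(user_message, history):
    # Same handler as in app.py: append the user turn and clear the textbox.
    return "", history + [{"role": "user", "content": user_message}]

history: list[dict[str, str]] = []
_, history = user("My Lord, I have a proposal...", history)

# bot() then splits the history: the last entry is the new user turn,
# and everything before it is passed to chat_with_llm_stream() as prior context.
user_message = history[-1]["content"]
history_for_model = history[:-1]

print(user_message)       # -> "My Lord, I have a proposal..."
print(history_for_model)  # -> []
```

The `queue=False` on the `user` step keeps that fast, UI-only update out of the queue, while the chained `.then(bot, ...)` step streams the model reply into the chatbot.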
src/unpredictable_lord/__init__.py CHANGED
@@ -1,2 +0,0 @@
- def main() -> None:
-     print("Hello from unpredictable-lord!")
src/unpredictable_lord/chat.py ADDED
@@ -0,0 +1,104 @@
+ """
+ AI chat functionality implementation
+
+ Provides chat functionality by calling the LLM endpoint on Modal.
+ """
+
+ import modal
+ import openai_harmony as oh
+
+ APP_NAME = "unpredictable-lord"
+ LLMModel = modal.Cls.from_name(APP_NAME, "LLMModel")
+ model = LLMModel()
+
+
+ def chat_with_llm_stream(
+     user_message: str,
+     chat_history: list[dict[str, str]],
+ ):
+     """
+     Chat with LLM (streaming version)
+
+     Args:
+         user_message: User's message
+         chat_history: Past chat history (list of dictionaries in Gradio format)
+             [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
+
+     Yields:
+         updated_chat_history: Updated chat history (Gradio format)
+     """
+     try:
+         # 1. Build message list for LLM request (openai-harmony format)
+         messages = []
+
+         # System message
+         system_content = (
+             oh.SystemContent.new()
+             .with_model_identity(
+                 "You are a lord of a medieval fantasy kingdom. The user is your advisor. "
+                 "Listen to your advisor's advice and act for the development of your territory and the maintenance of your authority. "
+                 "Speak in an arrogant tone."
+             )
+             .with_reasoning_effort(oh.ReasoningEffort.LOW)
+             # .with_conversation_start_date("2025-11-21")
+         )
+         messages.append(
+             oh.Message.from_role_and_content(oh.Role.SYSTEM, system_content)
+         )
+
+         # Convert past history to openai-harmony format and add
+         for msg in chat_history:
+             if msg["role"] == "user":
+                 messages.append(
+                     oh.Message.from_role_and_content(oh.Role.USER, msg["content"])
+                 )
+             elif msg["role"] == "assistant":
+                 messages.append(
+                     oh.Message.from_role_and_content(oh.Role.ASSISTANT, msg["content"])
+                 )
+
+         # Add current user message
+         messages.append(oh.Message.from_role_and_content(oh.Role.USER, user_message))
+
+         # Encode message
+         convo = oh.Conversation.from_messages(messages)
+         encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+         input_tokens = encoding.render_conversation_for_completion(
+             convo, oh.Role.ASSISTANT
+         )
+
+         parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
+
+         # 2. Build history list for UI display (Gradio format)
+         # Add user message and empty assistant message to be generated
+         partial_history = chat_history + [
+             {"role": "user", "content": user_message},
+             {"role": "assistant", "content": ""},
+         ]
+
+         # Streaming generation
+         generator = model.generate_stream.remote_gen(input_tokens)
+
+         response_text = ""
+         for token in generator:
+             if token is None:
+                 continue
+             parser.process(token)
+
+             # Get content only from the final channel
+             if parser.current_channel == "final":
+                 delta = parser.last_content_delta
+                 if delta:
+                     response_text += delta
+                     # Update history and yield
+                     partial_history[-1]["content"] = response_text
+                     yield partial_history
+
+     except Exception as e:
+         error_message = f"An error occurred: {str(e)}"
+         print(error_message)
+         updated_history = chat_history + [
+             {"role": "user", "content": user_message},
+             {"role": "assistant", "content": error_message},
+         ]
+         yield updated_history
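Since `chat_with_llm_stream` is a plain generator, it can also be exercised outside Gradio. A minimal sketch, assuming the Modal app `unpredictable-lord` from `modal_main.py` is already deployed and the package is importable (e.g. `src` on the Python path); the advisor question is illustrative:

```python
# Drive chat_with_llm_stream() directly and print the streamed reply.
# Assumes `uv run modal deploy src/unpredictable_lord/modal_main.py` has been run.
from unpredictable_lord.chat import chat_with_llm_stream

history: list[dict[str, str]] = []

for updated in chat_with_llm_stream("My Lord, should we raise taxes?", history):
    history = updated  # each yield is the full Gradio-format history so far

# The last entry holds the fully streamed assistant reply (or an error message).
print(history[-1]["content"])
```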
src/unpredictable_lord/modal_main.py ADDED
@@ -0,0 +1,203 @@
+ import subprocess
+ from queue import Queue
+ from threading import Thread
+
+ import modal
+ import openai_harmony as oh
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers.generation.streamers import BaseStreamer
+
+ APP_NAME = "unpredictable-lord"
+ VOLUME_NAME = APP_NAME + "-volume"
+ MOUNT_VOLUME = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
+ MOUNT_DIR = "/data"
+
+ # https://huggingface.co/openai/gpt-oss-20b
+ MODEL_IDENTIFIER = "openai/gpt-oss-20b"
+ # https://huggingface.co/openai/gpt-oss-120b
+ # MODEL_IDENTIFIER = "openai/gpt-oss-120b"
+
+ # https://modal.com/docs/guide/gpu#specifying-gpu-type
+ GPU_NAME = "L4"
+ GPU_NUM = 1  # Number of GPUs to use
+ GPU = f"{GPU_NAME}:{GPU_NUM}"
+
+ # https://modal.com/pricing
+ # | GPU       | Memory | Price    |
+ # |-----------|--------|----------|
+ # | B200      | 180 GB | $6.25 /h |
+ # | H200      | 141 GB | $4.54 /h |
+ # | H100      | 80 GB  | $3.95 /h |
+ # | A100-80GB | 80 GB  | $2.50 /h |
+ # | A100-40GB | 40 GB  | $2.10 /h |
+ # | L40S      | 48 GB  | $1.95 /h |
+ # | A10G      | 24 GB  | $1.10 /h |
+ # | L4        | 24 GB  | $0.80 /h |
+ # | T4        | 16 GB  | $0.59 /h |
+
+ # MAX_MODEL_TOKENS >= Input + Output
+ MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k (128 * 1024) tokens
+ MAX_OUTPUT_TOKENS = 512
+
+ image = (
+     # https://hub.docker.com/r/nvidia/cuda/tags?name=12.8
+     # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04
+     modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu24.04", add_python="3.12")
+     .pip_install(
+         [
+             "accelerate>=1.12.0",
+             "kernels>=0.11.1",
+             "openai-harmony>=0.0.8",
+             "torch>=2.9.0",
+             "transformers>=4.57.1",
+         ]
+     )
+     .env(
+         {
+             "HF_HOME": MOUNT_DIR + "/huggingface",
+         }
+     )
+ )
+
+ app = modal.App(APP_NAME, image=image)
+
+ # NOTE: `@app.cls`, `@modal.enter()`, and `@modal.method()` are used like `@app.function()`
+ # https://modal.com/docs/guide/lifecycle-functions
+
+
+ class TokenStreamer(BaseStreamer):
+     """
+     Streamer that queues token IDs directly.
+
+     NOTE: transformers' TextStreamer returns decoded text, but
+     OpenAI Harmony's parser requires token IDs, so it cannot be used.
+     Therefore, I implemented a custom streamer that queues token IDs directly.
+     """
+
+     def __init__(self, timeout=None):
+         self.token_queue = Queue()
+         self.stop_signal = None
+         self.timeout = timeout
+
+     def put(self, value):
+         if value.ndim > 1:
+             value = value.flatten()
+         for token in value:
+             self.token_queue.put(token.item())
+
+     def end(self):
+         self.token_queue.put(self.stop_signal)
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         value = self.token_queue.get(timeout=self.timeout)
+         if value == self.stop_signal:
+             raise StopIteration()
+         return value
+
+
+ @app.cls(
+     gpu=GPU,
+     image=image,
+     volumes={MOUNT_DIR: MOUNT_VOLUME},
+     # secrets=[modal.Secret.from_name("huggingface-secret")],
+     # scaledown_window=15 * 60,
+     # timeout=30 * 60,
+ )
+ class LLMModel:
+     @modal.enter()
+     def setup(self):
+         # Ensure the cache volume is the latest
+         MOUNT_VOLUME.reload()
+
+         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
+         self.model = AutoModelForCausalLM.from_pretrained(
+             MODEL_IDENTIFIER,
+             dtype="auto",
+             device_map="auto",
+         )
+
+         # Commit the volume to ensure the model is saved
+         MOUNT_VOLUME.commit()
+
+         self.encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+         self.stop_token_ids = self.encoding.stop_tokens_for_assistant_actions()
+
+         # Show GPU information
+         subprocess.run(["nvidia-smi"])
+
+     @modal.method()
+     def generate_stream(self, input_tokens, _=None):
+         """
+         Generate a streaming response
+
+         Args:
+             input_tokens (list[int]): Input token IDs
+             _: Dummy parameter for compatibility
+         """
+
+         if len(input_tokens) + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
+             raise ValueError(
+                 f"Input length exceeds the maximum allowed tokens: {MAX_MODEL_TOKENS}. "
+                 f"Current input length: {len(input_tokens)} tokens."
+             )
+
+         input_ids = torch.tensor([input_tokens], dtype=torch.long).to(self.model.device)
+
+         streamer = TokenStreamer()
+         generation_kwargs = {
+             "input_ids": input_ids,
+             "max_new_tokens": MAX_OUTPUT_TOKENS,
+             "eos_token_id": self.stop_token_ids,
+             "streamer": streamer,
+         }
+
+         # Start generation in a separate thread
+         thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+         thread.start()
+
+         while True:
+             token_id = streamer.token_queue.get()
+             if token_id == streamer.stop_signal:
+                 break
+             yield token_id
+
+         thread.join()
+
+
+ @app.local_entrypoint()
+ def main():
+     # https://cookbook.openai.com/articles/openai-harmony#harmony-renderer-library
+     convo = oh.Conversation.from_messages(
+         [
+             oh.Message.from_role_and_content(oh.Role.SYSTEM, oh.SystemContent.new()),
+             oh.Message.from_role_and_content(
+                 oh.Role.DEVELOPER,
+                 oh.DeveloperContent.new().with_instructions(
+                     "Always respond in the same language as the user."
+                 ),
+             ),
+             oh.Message.from_role_and_content(
+                 oh.Role.USER, "Hi. How is the weather today?"
+             ),
+         ]
+     )
+
+     model = LLMModel()
+
+     encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+     input_tokens = encoding.render_conversation_for_completion(convo, oh.Role.ASSISTANT)
+
+     print("AI: ", end="", flush=True)
+
+     parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
+
+     for token in model.generate_stream.remote_gen(input_tokens):
+         parser.process(token)
+         delta = parser.last_content_delta
+         if delta:
+             print(delta, end="", flush=True)
+     print()
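The `TokenStreamer` note above (queue raw token IDs instead of decoded text) can be tried locally without Modal or a GPU. A minimal sketch under stated assumptions: `gpt2` is only an illustrative, CPU-friendly stand-in and not the model used by this Space, and the Harmony parsing step is omitted here:

```python
from queue import Queue
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import BaseStreamer


class TokenStreamer(BaseStreamer):
    """Queue raw token IDs instead of decoded text (same idea as in modal_main.py)."""

    def __init__(self):
        self.token_queue = Queue()
        self.stop_signal = None

    def put(self, value):
        # generate() passes tensors of token IDs (the prompt first, then new tokens).
        if value.ndim > 1:
            value = value.flatten()
        for token in value:
            self.token_queue.put(token.item())

    def end(self):
        self.token_queue.put(self.stop_signal)


# "gpt2" is an illustrative stand-in model for a local smoke test.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, my lord.", return_tensors="pt")
streamer = TokenStreamer()

# Run generation in a background thread and consume token IDs as they arrive,
# mirroring generate_stream() above.
thread = Thread(
    target=model.generate,
    kwargs={**inputs, "max_new_tokens": 20, "streamer": streamer},
)
thread.start()

while True:
    token_id = streamer.token_queue.get()
    if token_id == streamer.stop_signal:
        break
    # In the Space, these IDs are fed to a Harmony StreamableParser;
    # here they are just decoded for display (prompt tokens are echoed first).
    print(tokenizer.decode([token_id]), end="", flush=True)

thread.join()
print()
```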