"""
AI chat functionality implementation
Provides chat functionality by calling the LLM endpoint on Modal or using ZeroGPU.
"""

import logging
import os

import openai_harmony as oh
from dotenv import load_dotenv

load_dotenv()

logger = logging.getLogger(__name__)

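# USE_MODAL selects the backend: "true" routes generation to the deployed
# Modal app, anything else uses the local ZeroGPU implementation.
# Illustrative .env entry (an assumption about your setup): USE_MODAL=true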
USE_MODAL = os.getenv("USE_MODAL", "false").lower() == "true"

if USE_MODAL:
    import modal

    APP_NAME = "unpredictable-lord"
    _generate_stream = modal.Function.from_name(APP_NAME, "generate_stream")

    def generate_stream(input_tokens):
        # Stream tokens from the remote Modal function as a generator
        return _generate_stream.remote_gen(input_tokens)
else:
    from unpredictable_lord.llm_zerogpu import generate_stream as _generate_stream

    def generate_stream(input_tokens):
        # Delegate to the local ZeroGPU implementation
        return _generate_stream(input_tokens)
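
# Both branches expose the same interface: generate_stream(input_tokens)
# yields completion tokens one at a time, whether produced remotely on Modal
# or locally via ZeroGPU.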


def chat_with_llm_stream(
    user_message: str,
    chat_history: list[dict[str, str]],
):
    """
    Chat with the LLM (streaming version).

    Args:
        user_message: The user's message.
        chat_history: Past chat history (list of dictionaries in Gradio format):
            [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]

    Yields:
        updated_chat_history: Updated chat history (Gradio format).
    """
    try:
        # 1. Build the message list for the LLM request (openai-harmony format)
        messages = []

        # System message: sets the lord persona and low reasoning effort
        system_content = (
            oh.SystemContent.new()
            .with_model_identity(
                "You are a lord of a medieval fantasy kingdom. The user is your advisor. "
                "Listen to your advisor's advice and act for the development of your "
                "territory and the maintenance of your authority. "
                "Speak in an arrogant tone."
            )
            .with_reasoning_effort(oh.ReasoningEffort.LOW)
            # .with_conversation_start_date("2025-11-21")
        )
        messages.append(
            oh.Message.from_role_and_content(oh.Role.SYSTEM, system_content)
        )
        # Convert past history to openai-harmony format and append
        for msg in chat_history:
            if msg["role"] == "user":
                messages.append(
                    oh.Message.from_role_and_content(oh.Role.USER, msg["content"])
                )
            elif msg["role"] == "assistant":
                messages.append(
                    oh.Message.from_role_and_content(oh.Role.ASSISTANT, msg["content"])
                )

        # Append the current user message
        messages.append(oh.Message.from_role_and_content(oh.Role.USER, user_message))

        # Encode the conversation into input tokens for the assistant completion
        convo = oh.Conversation.from_messages(messages)
        encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
        input_tokens = encoding.render_conversation_for_completion(
            convo, oh.Role.ASSISTANT
        )
        # Incremental parser that tracks channels in the streamed output
        parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)

        # 2. Build the history list for UI display (Gradio format):
        # add the user message and an empty assistant message to be filled in
        partial_history = chat_history + [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": ""},
        ]

        # Streaming generation
        generator = generate_stream(input_tokens)
        response_text = ""
        for token in generator:
            if token is None:
                continue
            parser.process(token)
            # Keep only content from the "final" channel (the user-facing
            # answer), skipping the reasoning channels
            if parser.current_channel == "final":
                delta = parser.last_content_delta
                if delta:
                    response_text += delta
                    # Update history and yield the partial response
                    partial_history[-1]["content"] = response_text
                    yield partial_history
    except Exception:
        logger.exception("Error during chat_with_llm_stream")
        updated_history = chat_history + [
            {"role": "user", "content": user_message},
            {
                "role": "assistant",
                "content": "[Error occurred while generating response.]",
            },
        ]
        yield updated_history
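

if __name__ == "__main__":
    # Minimal manual smoke test (a sketch; assumes the selected backend is
    # reachable: the Modal app when USE_MODAL=true, otherwise the local
    # ZeroGPU implementation). Streams one exchange and prints the final reply.
    logging.basicConfig(level=logging.INFO)
    history: list[dict[str, str]] = []
    for partial in chat_with_llm_stream("How fares the harvest, my liege?", history):
        history = partial
    print(history[-1]["content"])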