ryomo committed
Commit 0ddba75 · Parent: f0226f8

refactor: switch llm_modal generate_stream implementation from class-based to function-based to align with llm_zerogpu.py

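In short, the caller side moves from looking up a deployed Modal class to looking up a deployed Modal function. A minimal sketch of the two remote-call patterns, using only names that appear in the diffs below (it is not the project's full code):

# Before: resolve the deployed class, instantiate it, call its @modal.method generator
LLMModel = modal.Cls.from_name(APP_NAME, "LLMModel")
model = LLMModel()
stream = model.generate_stream.remote_gen(input_tokens)

# After: resolve the deployed @app.function generator and call it directly
generate_stream = modal.Function.from_name(APP_NAME, "generate_stream")
stream = generate_stream.remote_gen(input_tokens)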
pyproject.toml CHANGED
@@ -31,3 +31,5 @@ build-backend = "hatchling.build"
 
 [tool.poe.tasks]
 gradio = "gradio app.py"
+modal-deploy = "uv run modal deploy src/unpredictable_lord/llm_modal.py"
+modal-run = "uv run modal run src/unpredictable_lord/llm_modal.py"
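These entries extend the existing poethepoet task table ([tool.poe.tasks]); assuming poe is available in the project environment (as the existing gradio task implies), the new commands would presumably be invoked as poe modal-deploy and poe modal-run.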
src/unpredictable_lord/chat.py CHANGED
@@ -21,18 +21,15 @@ if USE_MODAL:
     import modal
 
     APP_NAME = "unpredictable-lord"
-    LLMModel = modal.Cls.from_name(APP_NAME, "LLMModel")
-    model = LLMModel()
+    _generate_stream = modal.Function.from_name(APP_NAME, "generate_stream")
 
-    def _generate_stream(input_tokens):
-        return model.generate_stream.remote_gen(input_tokens)
+    def generate_stream(input_tokens):
+        return _generate_stream.remote_gen(input_tokens)
 else:
-    from unpredictable_lord.llm_zerogpu import (
-        generate_stream as generate_stream_zerogpu,
-    )
+    from unpredictable_lord.llm_zerogpu import generate_stream as _generate_stream
 
-    def _generate_stream(input_tokens):
-        return generate_stream_zerogpu(input_tokens)
+    def generate_stream(input_tokens):
+        return _generate_stream(input_tokens)
 
 
 def chat_with_llm_stream(
@@ -100,7 +97,7 @@ def chat_with_llm_stream(
     ]
 
     # Streaming generation
-    generater = _generate_stream(input_tokens)
+    generater = generate_stream(input_tokens)
 
     response_text = ""
     for token in generater:
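After this change, both backends are exposed to chat_with_llm_stream through the same generate_stream(input_tokens) wrapper, mirroring the function-based interface of llm_zerogpu.py. A minimal consumption sketch, modeled on the harmony-parser loop in llm_modal.py's local entrypoint (the variable names are illustrative, and encoding/input_tokens are assumed to be prepared as in that entrypoint):

parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)

response_text = ""
for token in generate_stream(input_tokens):
    parser.process(token)
    delta = parser.last_content_delta
    if delta:
        response_text += delta  # accumulate streamed text as it arrives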
src/unpredictable_lord/llm_modal.py CHANGED
@@ -61,81 +61,92 @@ image = (
             "HF_HOME": MOUNT_DIR + "/huggingface",
         }
     )
+    .add_local_python_source("unpredictable_lord")  # Include local package
 )
 
 app = modal.App(APP_NAME, image=image)
 
-# NOTE: `@app.cls`, `@modal.enter()`, and `@modal.method()` are used like `@app.function()`
-# https://modal.com/docs/guide/lifecycle-functions
 
+# Global model and tokenizer (loaded once per container)
+model = None
+tokenizer = None
+stop_token_ids = None
 
-@app.cls(
+
+def load_model():
+    """Load model and tokenizer into global variables."""
+    global model, tokenizer, stop_token_ids
+
+    if model is not None:
+        return
+
+    # Ensure the cache volume is the latest
+    MOUNT_VOLUME.reload()
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_IDENTIFIER,
+        torch_dtype="auto",
+        device_map="auto",
+    )
+
+    # Commit the volume to ensure the model is saved
+    MOUNT_VOLUME.commit()
+
+    # Load stop token IDs
+    _encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+    stop_token_ids = _encoding.stop_tokens_for_assistant_actions()
+
+    # Show GPU information
+    subprocess.run(["nvidia-smi"])
+
+
+@app.function(
     gpu=GPU,
-    image=image,
     volumes={MOUNT_DIR: MOUNT_VOLUME},
     # secrets=[modal.Secret.from_name("huggingface-secret")],
    # scaledown_window=15 * 60,
     # timeout=30 * 60,
 )
-class LLMModel:
-    @modal.enter()
-    def setup(self):
-        # Ensure the cache volume is the latest
-        MOUNT_VOLUME.reload()
-
-        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            MODEL_IDENTIFIER,
-            dtype="auto",
-            device_map="auto",
-        )
-
-        # Commit the volume to ensure the model is saved
-        MOUNT_VOLUME.commit()
-
-        self.encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
-        self.stop_token_ids = self.encoding.stop_tokens_for_assistant_actions()
-
-        # Show GPU information
-        subprocess.run(["nvidia-smi"])
-
-    @modal.method()
-    def generate_stream(self, input_tokens, _=None):
-        """
-        Generate a streaming response
-
-        Args:
-            input_tokens (list[int]): Input token IDs
-            _ : Dummy parameter for compatibility
-        """
-
-        if len(input_tokens) + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
-            raise ValueError(
-                f"Input length exceeds the maximum allowed tokens: {MAX_MODEL_TOKENS}. "
-                f"Current input length: {len(input_tokens)} tokens."
-            )
+def generate_stream(input_tokens):
+    """
+    Generate a streaming response
+
+    Args:
+        input_tokens (list[int]): Input token IDs
+
+    Yields:
+        int: Generated token IDs
+    """
+    load_model()
+
+    if len(input_tokens) + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
+        raise ValueError(
+            f"Input length exceeds the maximum allowed tokens: {MAX_MODEL_TOKENS}. "
+            f"Current input length: {len(input_tokens)} tokens."
+        )
 
-        input_ids = torch.tensor([input_tokens], dtype=torch.long).to(self.model.device)
-
-        streamer = TokenStreamer()
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "max_new_tokens": MAX_OUTPUT_TOKENS,
-            "eos_token_id": self.stop_token_ids,
-            "streamer": streamer,
-        }
-
-        # Start generation in a separate thread
-        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        while True:
-            token_id = streamer.token_queue.get()
-            if token_id == streamer.stop_signal:
-                break
-            yield token_id
-
-        thread.join()
+    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(model.device)
+
+    streamer = TokenStreamer()
+    generation_kwargs = {
+        "input_ids": input_ids,
+        "max_new_tokens": MAX_OUTPUT_TOKENS,
+        "eos_token_id": stop_token_ids,
+        "streamer": streamer,
+    }
+
+    # Start generation in a separate thread
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    while True:
+        token_id = streamer.token_queue.get()
+        if token_id == streamer.stop_signal:
+            break
+        yield token_id
+
+    thread.join()
 
 
 @app.local_entrypoint()
@@ -156,8 +167,6 @@ def main():
         ]
     )
 
-    model = LLMModel()
-
     encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
     input_tokens = encoding.render_conversation_for_completion(convo, oh.Role.ASSISTANT)
 
@@ -165,7 +174,7 @@ def main():
 
     parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
 
-    for token in model.generate_stream.remote_gen(input_tokens):
+    for token in generate_stream.remote_gen(input_tokens):
         parser.process(token)
         delta = parser.last_content_delta
         if delta:
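generate_stream depends on a TokenStreamer helper that is defined elsewhere in the repo and is not part of this diff. Based only on how it is used here (token_queue.get(), a stop_signal sentinel, and being passed as the streamer to model.generate), a hypothetical sketch of the assumed contract might look like the following, built on transformers' BaseStreamer interface; the real class may differ:

from queue import Queue

from transformers.generation.streamers import BaseStreamer


class TokenStreamer(BaseStreamer):
    """Queue-backed streamer: generate() pushes token ids, the caller drains them."""

    def __init__(self):
        self.token_queue = Queue()
        self.stop_signal = None   # sentinel enqueued when generation finishes
        self._skip_prompt = True  # generate() first calls put() with the prompt ids

    def put(self, value):
        # Called by model.generate() with a tensor of token ids.
        if self._skip_prompt:
            self._skip_prompt = False
            return
        for token_id in value.reshape(-1).tolist():
            self.token_queue.put(token_id)

    def end(self):
        # Called once generation is done; unblocks the consumer loop.
        self.token_queue.put(self.stop_signal)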