ewssbd committed (verified)
Commit 1b32362 · 1 Parent(s): 7ae4e4a

Update app.py

Files changed (1)
  1. app.py +44 -37
app.py CHANGED
@@ -1,55 +1,62 @@
-import os
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import hf_hub_download
+from transformers import pipeline, set_seed
 import gradio as gr

 # ---------------------------
-# Step 1: Define model info
+# Model setup
 # ---------------------------
-REPO_ID = "amusktweewt/tiny-model-500M-chat-v2"
-CACHE_DIR = "/tmp" # store model temporarily
+MODEL_NAME = "amusktweewt/tiny-model-500M-chat-v2"

-# ---------------------------
-# Step 2: Download model at runtime
-# ---------------------------
-print("===== Application Startup =====")
-print("Checking for model...")
-model_path = os.path.join(CACHE_DIR, REPO_ID.replace("/", "_"))
+print("Downloading and loading model...")
+chatbot = pipeline(
+    "text-generation",
+    model=MODEL_NAME,
+    device=0 if torch.cuda.is_available() else -1
+)

-if not os.path.exists(model_path):
-    print("Downloading model...")
-    model_path = hf_hub_download(repo_id=REPO_ID, cache_dir=CACHE_DIR)
-    print("Download completed!")
-else:
-    print("Model already cached!")
+set_seed(42)
+print(" Chatbot is ready!")

 # ---------------------------
-# Step 3: Load model and tokenizer
+# Chat prediction
 # ---------------------------
-print("Loading model and tokenizer...")
-device = "cuda" if torch.cuda.is_available() else "cpu"
-tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
-model = AutoModelForCausalLM.from_pretrained(REPO_ID, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32).to(device)
+def chat_with_model(user_input):
+    if not user_input.strip():
+        return "Please enter a message."

-# ---------------------------
-# Step 4: Define prediction function
-# ---------------------------
-def predict(prompt):
-    inputs = tokenizer(prompt, return_tensors="pt").to(device)
-    outputs = model.generate(**inputs, max_length=200, temperature=0.7)
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return response
+    messages = [
+        {"role": "user", "content": user_input},
+        {"role": "assistant", "content": ""}
+    ]
+
+    prompt = chatbot.tokenizer.apply_chat_template(messages, tokenize=False)
+
+    response = chatbot(
+        prompt,
+        do_sample=True,
+        max_new_tokens=256,
+        top_k=50,
+        temperature=0.2,
+        num_return_sequences=1,
+        repetition_penalty=1.1,
+        pad_token_id=chatbot.tokenizer.eos_token_id,
+        min_new_tokens=0
+    )
+
+    full_text = response[0]["generated_text"]
+    bot_response = full_text[len(prompt):].strip()
+
+    return bot_response

 # ---------------------------
-# Step 5: Gradio interface
+# Gradio interface
 # ---------------------------
 iface = gr.Interface(
-    fn=predict,
-    inputs=gr.Textbox(label="User Message"),
-    outputs=gr.Textbox(label="AI Response"),
-    title="Tiny Model 500M Chat v2",
-    description="A lightweight AI chat model that runs free on CPU."
+    fn=chat_with_model,
+    inputs=gr.Textbox(label="Enter your message"),
+    outputs=gr.Textbox(label="AI Reply"),
+    title="Tiny Chatbot 500M",
+    description="Lightweight chat model under 500MB, ideal for free Hugging Face CPU Spaces or n8n message handling."
 )

 if __name__ == "__main__":
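
For a quick local check of the updated version, the new function can be exercised directly, without going through the Gradio UI. The following is a minimal sketch, not part of the commit; it assumes the updated app.py is importable from the working directory and that importing it runs the module-level pipeline() setup, downloading and loading the model first (the hunk ends at the __main__ guard, so the actual launch call is not shown in this diff).

    # Minimal smoke test (sketch, not part of the commit): importing app runs
    # the module-level pipeline() call, so the model is downloaded and loaded
    # before chat_with_model is available.
    from app import chat_with_model

    reply = chat_with_model("Hello! What can you do?")
    print(reply)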