import gradio as gr
from transformers import pipeline
import torch
import threading
import time

# Configuration
MODEL_NAME = "trendmicro-ailab/Llama-Primus-Reasoning"


class FastModelHandler:
    def __init__(self):
        self.pipe = None
        self.loading = False
        self.loaded = False
        self.start_background_loading()

    def start_background_loading(self):
        """Load the model automatically in the background."""
        def load_model():
            self.loading = True
            try:
                print("šŸ”„ Model loading automatically...")
                # Use a pipeline for fast loading
                self.pipe = pipeline(
                    "text-generation",
                    model=MODEL_NAME,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    trust_remote_code=True
                )
                self.loaded = True
                self.loading = False
                print("āœ… Model loaded successfully!")
            except Exception as e:
                print(f"āŒ Loading error: {e}")
                # Fallback: load on CPU
                try:
                    self.pipe = pipeline(
                        "text-generation",
                        model=MODEL_NAME,
                        device_map="cpu"
                    )
                    self.loaded = True
                    self.loading = False
                    print("āœ… Model loaded on CPU!")
                except Exception as e2:
                    self.loading = False  # clear the flag so the UI reports failure
                    print(f"āŒ CPU loading failed: {e2}")

        # Start the background thread
        thread = threading.Thread(target=load_model, daemon=True)
        thread.start()

    def generate_fast(self, prompt, max_new_tokens=200):
        """Generate a response quickly."""
        if not self.loaded:
            if self.loading:
                return "ā³ Model is still loading... please wait 1-2 minutes"
            else:
                return "āŒ Model failed to load. Please refresh the page"

        try:
            # Fast generation
            start_time = time.time()
            result = self.pipe(
                prompt,
                max_new_tokens=int(max_new_tokens),  # cap generated tokens, not total length
                temperature=0.7,
                do_sample=True,
                num_return_sequences=1,
                pad_token_id=self.pipe.tokenizer.eos_token_id,  # use the model's own EOS token
                repetition_penalty=1.1,
                return_full_text=False  # return only the completion, not the prompt
            )
            generation_time = time.time() - start_time

            response = result[0]['generated_text']
            return f"{response}\n\n⚔ Generation time: {generation_time:.1f}s"
        except Exception as e:
            return f"āŒ Generation error: {str(e)}"


# The model initializes automatically on import
model_handler = FastModelHandler()

# Simple interface - just give a prompt, get a response
with gr.Blocks(theme=gr.themes.Soft(), title="Fast Reasoning Model") as demo:
    gr.Markdown("""
    # šŸš€ Fast Reasoning Model

    **The model loads automatically!** šŸ‘‡

    **Just type a prompt and hit Generate - you'll get a fast response!**
    """)

    # Real-time status display (refreshed by demo.load below)
    status_display = gr.HTML(
        value="<div style='text-align: center;'>šŸ”„ Model loading... please wait a moment</div>"
    )

    def update_status():
        """Report the loading status in real time."""
        if model_handler.loaded:
            return "<div style='text-align: center;'>āœ… Model loaded! You can enter a prompt now</div>"
        elif model_handler.loading:
            return "<div style='text-align: center;'>šŸ”„ Model loading... 1-2 minutes</div>"
        else:
            return "<div style='text-align: center;'>āŒ Model loading failed. Please refresh</div>"

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Your Prompt",
                placeholder="Type your question here...",
                lines=3,
                max_lines=3
            )

            with gr.Row():
                length_slider = gr.Slider(
                    minimum=50,
                    maximum=300,
                    value=150,
                    label="Response Length"
                )
                generate_btn = gr.Button("⚔ Generate Fast", variant="primary")

        with gr.Column():
            output_box = gr.Textbox(
                label="Model Response",
                lines=6,
                show_copy_button=True
            )

    # Quick examples - click one to auto-fill the prompt
    gr.Examples(
        examples=[
            ["How do I solve 15 + 27 Ɨ 3?"],
            ["Socrates is a man, all men are mortal. Socrates?"],
            ["If a train travels at 60 mph for 2 hours, what distance does it cover?"],
            ["If it rains, the ground gets wet. The ground is wet - is it raining?"],
            ["Give a simple reasoning example"]
        ],
        inputs=prompt_input,
        label="Quick Examples - Click to fill"
    )

    # Event handlers
    generate_btn.click(
        fn=model_handler.generate_fast,
        inputs=[prompt_input, length_slider],
        outputs=output_box
    )

    # Auto status update: poll every 5 seconds
    # (a static gr.HTML cannot poll itself via .change, so drive it from demo.load)
    demo.load(
        fn=update_status,
        inputs=None,
        outputs=status_display,
        every=5
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )