#!/usr/bin/env python3
"""
Qwen3-Omni GGUF usage examples

This script demonstrates how to run the GGUF build of the Qwen3-Omni model
on a variety of tasks, both through the Ollama API and by calling
llama-cpp-python directly.
"""

import json
import time
from pathlib import Path

import requests

try:
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("⚠️ llama-cpp-python not installed. Install with: pip install llama-cpp-python")


class QwenGGUFRunner:
    """Runner for the quantized Qwen GGUF model via llama-cpp-python."""

    def __init__(self, model_path: str = "qwen3_omni_quantized.gguf"):
        self.model_path = model_path
        self.llm = None

    def load_with_llama_cpp(self, **kwargs):
        """Load the model with llama-cpp-python."""
        if not LLAMA_CPP_AVAILABLE:
            raise ImportError("llama-cpp-python not available")

        default_params = {
            'n_gpu_layers': 35,   # number of layers offloaded to the GPU
            'n_ctx': 4096,        # context length
            'n_batch': 512,       # batch size
            'verbose': False,     # quiet mode
            'n_threads': 8,       # CPU threads
        }
        default_params.update(kwargs)

        print(f"🚀 Loading GGUF model: {self.model_path}")
        start_time = time.time()

        self.llm = Llama(model_path=self.model_path, **default_params)

        load_time = time.time() - start_time
        print(f"✅ Model loaded in {load_time:.2f}s")
        return self.llm

    def generate_with_llama_cpp(self, prompt: str, **kwargs) -> str:
        """Generate text with llama-cpp-python."""
        if not self.llm:
            raise ValueError("Model not loaded. Call load_with_llama_cpp() first.")

        default_params = {
            'max_tokens': 256,
            'temperature': 0.7,
            'top_p': 0.8,
            'top_k': 50,
            'repeat_penalty': 1.1,
            'stop': ["<|im_end|>", "<|endoftext|>"]  # Qwen ChatML end-of-turn / end-of-text tokens
        }
        default_params.update(kwargs)

        print("💭 Generating response...")
        start_time = time.time()

        response = self.llm(prompt, **default_params)

        gen_time = time.time() - start_time
        tokens = len(response['choices'][0]['text'].split())
        speed = tokens / gen_time if gen_time > 0 else 0
        print(f"⚡ Generated {tokens} tokens in {gen_time:.2f}s ({speed:.1f} tok/s)")

        return response['choices'][0]['text']


class OllamaAPI:
    """Ollama API client."""

    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url
        self.model_name = "qwen3-omni-quantized"

    def check_connection(self) -> bool:
        """Check that the Ollama server is reachable."""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def is_model_available(self) -> bool:
        """Check whether the model has been created in Ollama."""
        try:
            response = requests.get(f"{self.base_url}/api/tags")
            models = response.json().get("models", [])
            return any(model["name"] == self.model_name for model in models)
        except (requests.RequestException, KeyError, ValueError):
            return False

    def generate(self, prompt: str, **kwargs) -> str:
        """Generate text through the Ollama API."""
        if not self.check_connection():
            raise ConnectionError("Cannot connect to Ollama API")
        if not self.is_model_available():
            raise ValueError(f"Model {self.model_name} not found in Ollama")

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": kwargs.get("temperature", 0.7),
                "top_p": kwargs.get("top_p", 0.8),
                "top_k": kwargs.get("top_k", 50),
                "repeat_penalty": kwargs.get("repeat_penalty", 1.1),
                "num_predict": kwargs.get("max_tokens", 256),
            }
        }

        print("💭 Sending request to Ollama...")
        start_time = time.time()

        response = requests.post(
            f"{self.base_url}/api/generate",
            json=payload,
            timeout=60
        )

        if response.status_code != 200:
            raise RuntimeError(f"Ollama API error: {response.text}")

        result = response.json()
        gen_time = time.time() - start_time

        # Estimate token count and throughput
        output_text = result["response"]
        tokens = len(output_text.split())
        speed = tokens / gen_time if gen_time > 0 else 0
        print(f"⚡ Generated {tokens} tokens in {gen_time:.2f}s ({speed:.1f} tok/s)")

        return output_text
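

# Optional sketch, not used by the examples below: a streaming variant of the
# Ollama call, for when you want tokens printed as they arrive. It assumes
# Ollama's standard /api/generate streaming behaviour (newline-delimited JSON
# chunks, each carrying a "response" fragment and a final chunk with
# "done": true). The helper name `stream_generate` is illustrative only.
def stream_generate(api: OllamaAPI, prompt: str, **kwargs) -> str:
    """Stream tokens from Ollama, printing fragments as they arrive."""
    payload = {
        "model": api.model_name,
        "prompt": prompt,
        "stream": True,  # ask Ollama for incremental chunks instead of one blob
        "options": {
            "temperature": kwargs.get("temperature", 0.7),
            "num_predict": kwargs.get("max_tokens", 256),
        }
    }

    output = []
    with requests.post(f"{api.base_url}/api/generate", json=payload,
                       stream=True, timeout=60) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)          # one JSON object per line
            fragment = chunk.get("response", "")
            print(fragment, end="", flush=True)
            output.append(fragment)
            if chunk.get("done"):             # final chunk carries timing stats
                break
    print()
    return "".join(output)

# Hypothetical usage: stream_generate(OllamaAPI(), "解釋什麼是機器學習", max_tokens=128)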


def run_examples():
    """Run the example prompts."""
    examples = [
        {
            "name": "🌟 Creative writing",
            "prompt": "請寫一個關於AI和人類合作探索宇宙的短故事,要有科幻感和哲理思考。",
            "params": {"temperature": 0.8, "max_tokens": 400}
        },
        {
            "name": "💻 Code generation",
            "prompt": "請用Python寫一個快速排序算法,包含詳細註解和時間複雜度分析。",
            "params": {"temperature": 0.3, "max_tokens": 500}
        },
        {
            "name": "🧮 Mathematical reasoning",
            "prompt": "一個圓的半徑是5cm,請計算其面積和周長,並解釋計算過程。",
            "params": {"temperature": 0.2, "max_tokens": 300}
        },
        {
            "name": "🌐 Multilingual translation",
            "prompt": "Please translate this English text to Chinese: 'Artificial Intelligence is revolutionizing the way we interact with technology, making it more intuitive and human-friendly.'",
            "params": {"temperature": 0.3, "max_tokens": 200}
        },
        {
            "name": "🤔 Logical reasoning",
            "prompt": "如果所有的A都是B,所有的B都是C,而某個X是A,那麼X是什麼?請解釋邏輯推理過程。",
            "params": {"temperature": 0.1, "max_tokens": 250}
        }
    ]

    # Check Ollama availability
    ollama = OllamaAPI()
    ollama_available = ollama.check_connection() and ollama.is_model_available()

    # Check whether the GGUF file is available locally
    gguf_available = LLAMA_CPP_AVAILABLE and Path("qwen3_omni_quantized.gguf").exists()

    print("=" * 80)
    print("🔥 Qwen3-Omni GGUF usage examples")
    print("=" * 80)
    print(f"💾 Ollama API available: {'✅' if ollama_available else '❌'}")
    print(f"📁 GGUF file available: {'✅' if gguf_available else '❌'}")
    print()

    # If neither backend is available, print setup instructions
    if not ollama_available and not gguf_available:
        print("⚠️ Please set up Ollama or download the GGUF file first:")
        print()
        print("🚀 Ollama setup:")
        print("   1. ollama create qwen3-omni-quantized -f Qwen3OmniQuantized.modelfile")
        print("   2. ollama serve")
        print()
        print("📁 GGUF file download:")
        print("   huggingface-cli download vito95311/Qwen3-Omni-30B-A3B-Thinking-GGUF-INT8FP16 qwen3_omni_quantized.gguf")
        return

    # Prefer Ollama because it is simpler to use
    if ollama_available:
        print("🎯 Using the Ollama API for inference")
        runner_type = "ollama"
        api = ollama
    else:
        print("🎯 Using llama-cpp-python for inference")
        runner_type = "llama_cpp"
        runner = QwenGGUFRunner()
        runner.load_with_llama_cpp()

    print("=" * 80)

    # Run the examples
    for i, example in enumerate(examples, 1):
        print(f"\n📝 Example {i}: {example['name']}")
        print(f"💬 Prompt: {example['prompt'][:100]}...")
        print("-" * 40)

        try:
            if runner_type == "ollama":
                response = api.generate(example['prompt'], **example['params'])
            else:
                response = runner.generate_with_llama_cpp(example['prompt'], **example['params'])

            print(f"🤖 Response: {response.strip()}")
        except Exception as e:
            print(f"❌ Error: {str(e)}")

        print("-" * 40)

        # Pause briefly to avoid overloading the backend
        time.sleep(1)


def benchmark_performance():
    """Simple performance benchmark."""
    print("\n🏆 Performance benchmark")
    print("=" * 50)

    test_prompts = [
        "解釋什麼是機器學習",
        "寫一個Python函數來計算斐波那契數列",
        "描述量子計算的基本原理",
        "What are the benefits of renewable energy?",
        "如何優化深度學習模型的性能?"
    ]

    ollama = OllamaAPI()
    if ollama.check_connection() and ollama.is_model_available():
        print("📊 Benchmarking the Ollama API...")
        total_time = 0
        total_tokens = 0

        for i, prompt in enumerate(test_prompts, 1):
            print(f"  Test {i}/5: ", end="", flush=True)

            start_time = time.time()
            response = ollama.generate(prompt, max_tokens=100, temperature=0.7)
            end_time = time.time()

            test_time = end_time - start_time
            tokens = len(response.split())
            speed = tokens / test_time if test_time > 0 else 0

            total_time += test_time
            total_tokens += tokens

            print(f"{speed:.1f} tok/s")

        avg_speed = total_tokens / total_time if total_time > 0 else 0
        print(f"\n📈 Average throughput: {avg_speed:.1f} tokens/s")
        print(f"⏱️ Total time: {total_time:.2f}s")
        print(f"📝 Total tokens: {total_tokens}")
    else:
        print("⚠️ Ollama is not available, skipping the benchmark")


def main():
    """Entry point."""
    print("🔥 Qwen3-Omni GGUF usage examples")
    print("This script demonstrates how to run the GGUF model on a variety of AI tasks")

    # Run the usage examples
    run_examples()

    # Performance benchmark
    user_input = input("\n🤔 Run the performance benchmark? (y/n): ")
    if user_input.lower() in ['y', 'yes']:
        benchmark_performance()

    print("\n✨ Examples finished!")
    print("💡 See README.md for more usage details")


if __name__ == "__main__":
    main()