#!/usr/bin/env python3
"""
Qwen3-Omni GGUF usage examples

This script demonstrates how to run the GGUF build of the Qwen3-Omni model
on a variety of tasks, both through the Ollama API and by calling
llama-cpp-python directly.
"""

import json
import time
from pathlib import Path

import requests

try:
    from llama_cpp import Llama
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False
    print("⚠️ llama-cpp-python not installed. Install with: pip install llama-cpp-python")


class QwenGGUFRunner:
    """Runner for the quantized Qwen GGUF model via llama-cpp-python."""

    def __init__(self, model_path: str = "qwen3_omni_quantized.gguf"):
        self.model_path = model_path
        self.llm = None

    def load_with_llama_cpp(self, **kwargs):
        """Load the model with llama-cpp-python."""
        if not LLAMA_CPP_AVAILABLE:
            raise ImportError("llama-cpp-python not available")

        default_params = {
            'n_gpu_layers': 35,   # number of layers offloaded to the GPU
            'n_ctx': 4096,        # context length
            'n_batch': 512,       # batch size
            'verbose': False,     # quiet mode
            'n_threads': 8,       # CPU threads
        }
        default_params.update(kwargs)

        print(f"🚀 Loading GGUF model: {self.model_path}")
        start_time = time.time()

        self.llm = Llama(model_path=self.model_path, **default_params)

        load_time = time.time() - start_time
        print(f"✅ Model loaded in {load_time:.2f}s")
        return self.llm

    def generate_with_llama_cpp(self, prompt: str, **kwargs) -> str:
        """Generate text with llama-cpp-python."""
        if not self.llm:
            raise ValueError("Model not loaded. Call load_with_llama_cpp() first.")

        default_params = {
            'max_tokens': 256,
            'temperature': 0.7,
            'top_p': 0.8,
            'top_k': 50,
            'repeat_penalty': 1.1,
            'stop': ["<|im_end|>", "<|endoftext|>"]  # Qwen ChatML end-of-turn / end-of-text tokens
        }
        default_params.update(kwargs)

        print("💭 Generating response...")
        start_time = time.time()

        response = self.llm(prompt, **default_params)

        gen_time = time.time() - start_time
        tokens = len(response['choices'][0]['text'].split())
        speed = tokens / gen_time if gen_time > 0 else 0
        print(f"⚡ Generated {tokens} tokens in {gen_time:.2f}s ({speed:.1f} tok/s)")

        return response['choices'][0]['text']


class OllamaAPI:
    """Ollama API client."""

    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url
        self.model_name = "qwen3-omni-quantized"

    def check_connection(self) -> bool:
        """Check that the Ollama server is reachable."""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def is_model_available(self) -> bool:
        """Check whether the model has been created in Ollama."""
        try:
            response = requests.get(f"{self.base_url}/api/tags")
            models = response.json().get("models", [])
            return any(model["name"] == self.model_name for model in models)
        except (requests.RequestException, KeyError, ValueError):
            return False

    def generate(self, prompt: str, **kwargs) -> str:
        """Generate text through the Ollama API."""
        if not self.check_connection():
            raise ConnectionError("Cannot connect to Ollama API")
        if not self.is_model_available():
            raise ValueError(f"Model {self.model_name} not found in Ollama")

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": kwargs.get("temperature", 0.7),
                "top_p": kwargs.get("top_p", 0.8),
                "top_k": kwargs.get("top_k", 50),
                "repeat_penalty": kwargs.get("repeat_penalty", 1.1),
                "num_predict": kwargs.get("max_tokens", 256),
            }
        }

        print("💭 Sending request to Ollama...")
        start_time = time.time()

        response = requests.post(
            f"{self.base_url}/api/generate",
            json=payload,
            timeout=60
        )

        if response.status_code != 200:
            raise RuntimeError(f"Ollama API error: {response.text}")

        result = response.json()
        gen_time = time.time() - start_time

        # Estimate token count and throughput
        output_text = result["response"]
        tokens = len(output_text.split())
        speed = tokens / gen_time if gen_time > 0 else 0
        print(f"⚡ Generated {tokens} tokens in {gen_time:.2f}s ({speed:.1f} tok/s)")

        return output_text
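

# Optional sketch, not used by the examples below: a streaming variant of the
# Ollama call, for when you want tokens printed as they arrive. It assumes
# Ollama's standard /api/generate streaming behaviour (newline-delimited JSON
# chunks, each carrying a "response" fragment and a final chunk with
# "done": true). The helper name `stream_generate` is illustrative only.
def stream_generate(api: OllamaAPI, prompt: str, **kwargs) -> str:
    """Stream tokens from Ollama, printing fragments as they arrive."""
    payload = {
        "model": api.model_name,
        "prompt": prompt,
        "stream": True,  # ask Ollama for incremental chunks instead of one blob
        "options": {
            "temperature": kwargs.get("temperature", 0.7),
            "num_predict": kwargs.get("max_tokens", 256),
        }
    }

    output = []
    with requests.post(f"{api.base_url}/api/generate", json=payload,
                       stream=True, timeout=60) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)          # one JSON object per line
            fragment = chunk.get("response", "")
            print(fragment, end="", flush=True)
            output.append(fragment)
            if chunk.get("done"):             # final chunk carries timing stats
                break
    print()
    return "".join(output)

# Hypothetical usage: stream_generate(OllamaAPI(), "解釋什麼是機器學習", max_tokens=128)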


def run_examples():
    """Run the example prompts."""
    examples = [
        {
            "name": "🌟 Creative writing",
            "prompt": "請寫一個關於AI和人類合作探索宇宙的短故事,要有科幻感和哲理思考。",
            "params": {"temperature": 0.8, "max_tokens": 400}
        },
        {
            "name": "💻 Code generation",
            "prompt": "請用Python寫一個快速排序算法,包含詳細註解和時間複雜度分析。",
            "params": {"temperature": 0.3, "max_tokens": 500}
        },
        {
            "name": "🧮 Mathematical reasoning",
            "prompt": "一個圓的半徑是5cm,請計算其面積和周長,並解釋計算過程。",
            "params": {"temperature": 0.2, "max_tokens": 300}
        },
        {
            "name": "🌐 Multilingual translation",
            "prompt": "Please translate this English text to Chinese: 'Artificial Intelligence is revolutionizing the way we interact with technology, making it more intuitive and human-friendly.'",
            "params": {"temperature": 0.3, "max_tokens": 200}
        },
        {
            "name": "🤔 Logical reasoning",
            "prompt": "如果所有的A都是B,所有的B都是C,而某個X是A,那麼X是什麼?請解釋邏輯推理過程。",
            "params": {"temperature": 0.1, "max_tokens": 250}
        }
    ]

    # Check Ollama availability
    ollama = OllamaAPI()
    ollama_available = ollama.check_connection() and ollama.is_model_available()

    # Check whether the GGUF file is available locally
    gguf_available = LLAMA_CPP_AVAILABLE and Path("qwen3_omni_quantized.gguf").exists()

    print("=" * 80)
    print("🔥 Qwen3-Omni GGUF usage examples")
    print("=" * 80)
    print(f"💾 Ollama API available: {'✅' if ollama_available else '❌'}")
    print(f"📁 GGUF file available: {'✅' if gguf_available else '❌'}")
    print()

    # If neither backend is available, print setup instructions
    if not ollama_available and not gguf_available:
        print("⚠️ Please set up Ollama or download the GGUF file first:")
        print()
        print("🚀 Ollama setup:")
        print("   1. ollama create qwen3-omni-quantized -f Qwen3OmniQuantized.modelfile")
        print("   2. ollama serve")
        print()
        print("📁 GGUF file download:")
        print("   huggingface-cli download vito95311/Qwen3-Omni-30B-A3B-Thinking-GGUF-INT8FP16 qwen3_omni_quantized.gguf")
        return

    # Prefer Ollama because it is simpler to use
    if ollama_available:
        print("🎯 Using the Ollama API for inference")
        runner_type = "ollama"
        api = ollama
    else:
        print("🎯 Using llama-cpp-python for inference")
        runner_type = "llama_cpp"
        runner = QwenGGUFRunner()
        runner.load_with_llama_cpp()

    print("=" * 80)

    # Run the examples
    for i, example in enumerate(examples, 1):
        print(f"\n📝 Example {i}: {example['name']}")
        print(f"💬 Prompt: {example['prompt'][:100]}...")
        print("-" * 40)

        try:
            if runner_type == "ollama":
                response = api.generate(example['prompt'], **example['params'])
            else:
                response = runner.generate_with_llama_cpp(example['prompt'], **example['params'])

            print(f"🤖 Response: {response.strip()}")
        except Exception as e:
            print(f"❌ Error: {str(e)}")

        print("-" * 40)

        # Pause briefly to avoid overloading the backend
        time.sleep(1)


def benchmark_performance():
    """Simple performance benchmark."""
    print("\n🏆 Performance benchmark")
    print("=" * 50)

    test_prompts = [
        "解釋什麼是機器學習",
        "寫一個Python函數來計算斐波那契數列",
        "描述量子計算的基本原理",
        "What are the benefits of renewable energy?",
        "如何優化深度學習模型的性能?"
    ]

    ollama = OllamaAPI()
    if ollama.check_connection() and ollama.is_model_available():
        print("📊 Benchmarking the Ollama API...")
        total_time = 0
        total_tokens = 0

        for i, prompt in enumerate(test_prompts, 1):
            print(f"  Test {i}/5: ", end="", flush=True)

            start_time = time.time()
            response = ollama.generate(prompt, max_tokens=100, temperature=0.7)
            end_time = time.time()

            test_time = end_time - start_time
            tokens = len(response.split())
            speed = tokens / test_time if test_time > 0 else 0

            total_time += test_time
            total_tokens += tokens

            print(f"{speed:.1f} tok/s")

        avg_speed = total_tokens / total_time if total_time > 0 else 0
        print(f"\n📈 Average throughput: {avg_speed:.1f} tokens/s")
        print(f"⏱️ Total time: {total_time:.2f}s")
        print(f"📝 Total tokens: {total_tokens}")
    else:
        print("⚠️ Ollama is not available, skipping the benchmark")


def main():
    """Entry point."""
    print("🔥 Qwen3-Omni GGUF usage examples")
    print("This script demonstrates how to run the GGUF model on a variety of AI tasks")

    # Run the usage examples
    run_examples()

    # Performance benchmark
    user_input = input("\n🤔 Run the performance benchmark? (y/n): ")
    if user_input.lower() in ['y', 'yes']:
        benchmark_performance()

    print("\n✨ Examples finished!")
    print("💡 See README.md for more usage details")


if __name__ == "__main__":
    main()