""" Gradio Web UI for HunyuanOCR Text Spotting Upload an image and get text detection with bounding boxes """ import gradio as gr from PIL import Image import os # Set environment variable to avoid tokenizer parallelism deadlocks os.environ["TOKENIZERS_PARALLELISM"] = "false" import spaces from ocr_model import HunyuanOCR from visualization import draw_detection_boxes, get_detection_summary from dotenv import load_dotenv from openai import OpenAI from huggingface_hub import hf_hub_download # Load environment variables load_dotenv() # Global model instance (loaded once) ocr_model = None def download_font(): """Download font from Hugging Face Hub if not exists""" font_dir = os.path.join(os.path.dirname(__file__), "fonts") os.makedirs(font_dir, exist_ok=True) font_path = os.path.join(font_dir, "NotoSansCJK-Light.ttc") if not os.path.exists(font_path): print("Downloading font from Hugging Face Hub...") try: hf_hub_download( repo_id="jzhang533/fonts", filename="NotoSansCJK-Light.ttc", repo_type="dataset", local_dir=font_dir, local_dir_use_symlinks=False ) print("Font downloaded successfully!") except Exception as e: print(f"Failed to download font: {e}") def initialize_model(): """Initialize the OCR model (called once at startup)""" global ocr_model if ocr_model is None: # Ensure font is available download_font() print("Initializing HunyuanOCR model...") ocr_model = HunyuanOCR() print("Model ready!") return ocr_model @spaces.GPU def process_image(image: Image.Image, prompt: str = None, target_language: str = "Chinese"): """ Process uploaded image and return annotated result Args: image: PIL Image from Gradio prompt: Optional custom prompt target_language: Target language for translation (Original, Chinese, English, French, etc.) Returns: Tuple of (annotated_image, detection_summary, raw_response) """ if image is None: return None, "Please upload an image first.", "" try: # Initialize model if needed model = initialize_model() # Resize image if height > 960 while maintaining aspect ratio if image.height > 960: aspect_ratio = image.width / image.height new_height = 960 new_width = int(new_height * aspect_ratio) print(f"Resizing image from {image.size} to ({new_width}, {new_height})") image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) # Get image dimensions image_width, image_height = image.size # Use default prompt if not provided if not prompt or prompt.strip() == "": prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。" # Detect text print("Running text detection...") response = model.detect_text(image, prompt) # Parse results detections = model.parse_detection_results(response, image_width, image_height) # Merge detections first (since visualization does it internally, we need to do it here for translation) from visualization import merge_detections merged_detections = merge_detections(detections) # Translate text in merged detections if not "Original" if target_language != "Original": print(f"Translating text to {target_language}...") for det in merged_detections: original_text = det['text'] translated = translate_text(original_text, target_language) det['original_text'] = original_text det['text'] = translated print(f"Translated: {original_text[:20]}... -> {translated[:20]}...") else: print("Skipping translation (Original selected)") # Draw boxes on image (pass merged detections and disable internal merging) annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False) # Create summary summary = get_detection_summary(merged_detections) print(f"Detected {len(detections)} text regions") return annotated_image, summary, response except Exception as e: error_msg = f"Error processing image: {str(e)}" print(error_msg) return None, error_msg, "" def translate_text(text: str, target_language: str = "Chinese") -> str: """ Translate text to target language using model specified in .env via OpenAI-compatible API """ try: api_key = os.getenv("MODEL_ACCESS_TOKEN") base_url = os.getenv("MODEL_API_URL") model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k") # Default fallback if not api_key or not base_url: print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not found in .env") return text client = OpenAI(api_key=api_key, base_url=base_url) system_prompt = f"You are a professional manga translator. The following text is from a Japanese manga. Translate it into natural and expressive {target_language}, maintaining the character's tone and the context of the scene. Only output the translation, no explanations." response = client.chat.completions.create( model=model_name, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": text} ] ) return response.choices[0].message.content.strip() except Exception as e: print(f"Translation error: {e}") return text def create_demo(): """Create and configure the Gradio interface""" with gr.Blocks(title="AI Manga Translator") as demo: gr.Markdown(""" # 📚 AI Manga Translator An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics. **Key Capabilities:** - 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text. - 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly. - 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.). - 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds. """) with gr.Row(): with gr.Column(scale=1): # Input section gr.Markdown("### 📤 Input") input_image = gr.Image( type="pil", label="Upload Image", sources=["upload", "clipboard"] ) custom_prompt = gr.Textbox( label="Custom Prompt (Optional)", placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。", lines=2 ) target_lang = gr.Dropdown( choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"], value="Chinese", label="Target Language", info="Select language for translation (Original = no translation)" ) detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg") with gr.Column(scale=1): # Output section gr.Markdown("### 📊 Results") output_image = gr.Image( type="pil", label="Detected Text with Bounding Boxes" ) detection_summary = gr.Textbox( label="Detection Summary", lines=10, max_lines=20 ) with gr.Accordion("Raw Model Response", open=False): raw_output = gr.Textbox(label="Raw Output", lines=5) # Connect the button detect_btn.click( fn=process_image, inputs=[input_image, custom_prompt, target_lang], outputs=[output_image, detection_summary, raw_output] ) # Examples gr.Markdown("### 📝 Examples") gr.Examples( examples=[ ["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"], ["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"], ["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"], ], inputs=[input_image, custom_prompt], label="Click to use example image" ) gr.Markdown(""" --- ### ℹ️ About This application combines state-of-the-art AI technologies to provide seamless manga translation: - **OCR Engine**: HunyuanOCR. - **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations. - **Development**: Vibe coded with **Gemini 3 Pro**. """) return demo if __name__ == "__main__": # Create and launch the demo print("Loading model (this may take a minute on first run)...") demo = create_demo() # Launch with public link option demo.launch( server_name="0.0.0.0", share=False, # Set to True to create a public link show_error=True, ssr_mode=False )