Spaces:

jzhang533
/

ai_manga_translator

Running on Zero

App Files Files Community

jzhang533 committed 16 days ago

Commit

5b3defa

1 Parent(s): 0a9cd06

ai manga translator

Browse files

Signed-off-by: Zhang Jun <[email protected]>

Files changed (11) hide show

.env.example +3 -0
.gitattributes +3 -0
.gitignore +13 -0
README.md +20 -1
app.py +233 -0
examples/dandadan.png +3 -0
examples/ruridragon.png +3 -0
examples/spyfamily.png +3 -0
ocr_model.py +319 -0
requirements.txt +9 -0
visualization.py +385 -0

.env.example ADDED Viewed

	@@ -0,0 +1,3 @@

+MODEL_ACCESS_TOKEN=your_token_here
+MODEL_API_URL=your_api_url_here
+MODEL_NAME=your_model_name_here

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/dandadan.png filter=lfs diff=lfs merge=lfs -text
+examples/ruridragon.png filter=lfs diff=lfs merge=lfs -text
+examples/spyfamily.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,13 @@

+__pycache__/
+*.pyc
+.DS_Store
+venv/
+.venv/
+.ipynb_checkpoints/
+# Environment variables
+.env
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local

README.md CHANGED Viewed

@@ -10,4 +10,23 @@ pinned: false
 license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 license: apache-2.0
 ---
+# 📚 AI Manga Translator
+An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics.
+**Key Capabilities:**
+- 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
+- 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
+- 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
+## Technologies
+- **OCR Engine**: HunyuanOCR
+- **Translation**: ERNIE 4.5 (via API)
+- **Development**: Vibe coded with Gemini 3 Pro
+## Setup
+To run this locally:
+1. Install dependencies: `pip install -r requirements.txt`
+2. Set up `.env`
+3. Run `python app.py`.

app.py ADDED Viewed

	@@ -0,0 +1,233 @@

+"""
+Gradio Web UI for HunyuanOCR Text Spotting
+Upload an image and get text detection with bounding boxes
+"""
+import gradio as gr
+from PIL import Image
+import os
+from ocr_model import HunyuanOCR
+from visualization import draw_detection_boxes, get_detection_summary
+from dotenv import load_dotenv
+from openai import OpenAI
+# Load environment variables
+load_dotenv()
# Process-wide HunyuanOCR instance; stays None until a request first needs it.
ocr_model = None


def initialize_model():
    """Return the shared OCR model, constructing it on first use.

    The instance is cached in the module-level ``ocr_model`` variable, so
    ``HunyuanOCR()`` is constructed only while that variable is still None.

    Returns:
        The cached ``HunyuanOCR`` instance.
    """
    global ocr_model
    if ocr_model is not None:
        return ocr_model
    print("Initializing HunyuanOCR model...")
    ocr_model = HunyuanOCR()
    print("Model ready!")
    return ocr_model
def process_image(image: Image.Image, prompt: str = None, target_language: str = "Chinese"):
    """
    Run OCR on an uploaded image, optionally translate the text, and redraw it.

    Args:
        image: PIL Image from Gradio (None when nothing was uploaded)
        prompt: Optional custom prompt; a blank value selects the default
            Chinese text-spotting prompt
        target_language: Target language for translation. "Original" skips
            translation entirely.
    Returns:
        Tuple of (annotated_image, detection_summary, raw_response). On
        failure the tuple is (None, error_message, "").
    """
    if image is None:
        return None, "Please upload an image first.", ""
    try:
        # Initialize model if needed
        model = initialize_model()
        # Resize image if height > 960 while maintaining aspect ratio
        if image.height > 960:
            aspect_ratio = image.width / image.height
            new_height = 960
            # Clamp to 1px: a very narrow, tall image would otherwise round
            # down to a zero width, which Image.resize rejects.
            new_width = max(1, int(new_height * aspect_ratio))
            print(f"Resizing image from {image.size} to ({new_width}, {new_height})")
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
        # Dimensions are read after the resize so box coordinates match the
        # image that is actually drawn on.
        image_width, image_height = image.size
        # Use default prompt if not provided
        if not prompt or prompt.strip() == "":
            prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"
        # Detect text
        print("Running text detection...")
        response = model.detect_text(image, prompt)
        # Parse results
        detections = model.parse_detection_results(response, image_width, image_height)
        # Merge here (instead of inside the drawing helper) so that each
        # merged bubble is translated as one unit.
        from visualization import merge_detections
        merged_detections = merge_detections(detections)
        # Translate text in merged detections if not "Original"
        if target_language != "Original":
            print(f"Translating text to {target_language}...")
            for det in merged_detections:
                original_text = det['text']
                if not original_text.strip():
                    # Nothing to translate; skip the API round-trip.
                    continue
                translated = translate_text(original_text, target_language)
                det['original_text'] = original_text
                det['text'] = translated
                print(f"Translated: {original_text[:20]}... -> {translated[:20]}...")
        else:
            print("Skipping translation (Original selected)")
        # Draw boxes on image (pass merged detections and disable internal merging)
        annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False)
        # Create summary
        summary = get_detection_summary(merged_detections)
        print(f"Detected {len(detections)} text regions")
        return annotated_image, summary, response
    except Exception as e:
        # UI boundary: report the failure in the summary box instead of raising.
        error_msg = f"Error processing image: {str(e)}"
        print(error_msg)
        return None, error_msg, ""
def translate_text(text: str, target_language: str = "Chinese") -> str:
    """
    Translate text to target language using model specified in .env via OpenAI-compatible API

    Configuration is read from the environment: MODEL_ACCESS_TOKEN,
    MODEL_API_URL and MODEL_NAME (defaulting to "ernie-4.5-turbo-128k").

    Args:
        text: Source text to translate
        target_language: Language name inserted into the system prompt
    Returns:
        The translated text. The input is returned unchanged when it is
        blank, when the API is not configured, when the request fails, or
        when the model returns an empty completion.
    """
    # Blank input: nothing to translate, so avoid the API round-trip.
    if not text or not text.strip():
        return text
    api_key = os.getenv("MODEL_ACCESS_TOKEN")
    base_url = os.getenv("MODEL_API_URL")
    model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k") # Default fallback
    if not api_key or not base_url:
        print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not found in .env")
        return text
    try:
        client = OpenAI(api_key=api_key, base_url=base_url)
        system_prompt = f"You are a professional manga translator. The following text is from a Japanese manga. Translate it into natural and expressive {target_language}, maintaining the character's tone and the context of the scene. Only output the translation, no explanations."
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ]
        )
        content = response.choices[0].message.content
        translated = content.strip() if content else ""
        # An empty completion would blank the speech bubble; keep the source text.
        return translated or text
    except Exception as e:
        # Best-effort: a failed translation must not abort the whole page.
        print(f"Translation error: {e}")
        return text
def create_demo():
    """Create and configure the Gradio interface.

    Builds a two-column Blocks layout (inputs on the left, results on the
    right), wires the button to ``process_image`` and registers the bundled
    example images.

    Returns:
        The configured ``gr.Blocks`` app (not yet launched).
    """

    def _run_pipeline(image, prompt, language):
        # process_image returns (image, summary, raw_response) but only the
        # first two are displayed; drop the rest so the handler's return
        # arity matches the wired outputs.
        annotated, summary, *_ = process_image(image, prompt, language)
        return annotated, summary

    with gr.Blocks(title="AI Manga Translator") as demo:
        gr.Markdown("""
        # 📚 AI Manga Translator
        An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics.
        **Key Capabilities:**
        - 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
        - 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
        - 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
        - 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds.
        """)
        with gr.Row():
            with gr.Column(scale=1):
                # Input section
                gr.Markdown("### 📤 Input")
                input_image = gr.Image(
                    type="pil",
                    label="Upload Image",
                    sources=["upload", "clipboard"]
                )
                custom_prompt = gr.Textbox(
                    label="Custom Prompt (Optional)",
                    placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。",
                    lines=2
                )
                target_lang = gr.Dropdown(
                    choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"],
                    value="Chinese",
                    label="Target Language",
                    info="Select language for translation (Original = no translation)"
                )
                detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg")
            with gr.Column(scale=1):
                # Output section
                gr.Markdown("### 📊 Results")
                output_image = gr.Image(
                    type="pil",
                    label="Detected Text with Bounding Boxes"
                )
                detection_summary = gr.Textbox(
                    label="Detection Summary",
                    lines=10,
                    max_lines=20
                )
        # Connect the button
        detect_btn.click(
            fn=_run_pipeline,
            inputs=[input_image, custom_prompt, target_lang],
            outputs=[output_image, detection_summary]
        )
        # Examples only pre-fill the inputs; the user still presses the button.
        gr.Markdown("### 📝 Examples")
        gr.Examples(
            examples=[
                ["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
                ["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
            ],
            inputs=[input_image, custom_prompt],
            label="Click to use example image"
        )
        gr.Markdown("""
        ---
        ### ℹ️ About
        This application combines state-of-the-art AI technologies to provide seamless manga translation:
        - **OCR Engine**: HunyuanOCR.
        - **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations.
        - **Development**: Vibe coded with **Gemini 3 Pro**.
        """)
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on the loopback interface.
    print("Loading model (this may take a minute on first run)...")
    launch_options = {
        "server_name": "127.0.0.1",
        "share": False,  # Set to True to create a public link
        "show_error": True,
    }
    demo = create_demo()
    demo.launch(**launch_options)

examples/dandadan.png ADDED Viewed

Git LFS Details

SHA256: 7aba7ec46d88f0f0516a3602de3b06dddb2662db26ee4253937fd7fc4f5cda27
Pointer size: 131 Bytes
Size of remote file: 664 kB

examples/ruridragon.png ADDED Viewed

Git LFS Details

SHA256: f1f714f8d553328e6716bea94b80aaa7c07bbf9e4e1d240ed1dff92b51a40622
Pointer size: 131 Bytes
Size of remote file: 322 kB

examples/spyfamily.png ADDED Viewed

Git LFS Details

SHA256: 91a5e11a466f36b9ced68b6bcd95e28e64a334234c1088d59e504e4c9fdc0bcc
Pointer size: 131 Bytes
Size of remote file: 236 kB

ocr_model.py ADDED Viewed

	@@ -0,0 +1,319 @@

+"""
+HunyuanOCR Model Wrapper
+Provides an easy-to-use interface for text detection and recognition
+"""
+import re
+import os
+import torch
+from typing import Dict, List, Tuple, Optional
+from PIL import Image
+from transformers import AutoProcessor, HunYuanVLForConditionalGeneration
+from transformers.modeling_outputs import CausalLMOutputWithPast
+import requests
+from io import BytesIO
+# Monkey-patch HunYuanVLForConditionalGeneration.generate to fix dtype issue
def patched_generate(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    imgs: Optional[list[torch.FloatTensor]] = None,
    imgs_pos: Optional[list[int]] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[list[int]] = None,
    **kwargs,
) -> CausalLMOutputWithPast:
    """Replacement ``generate`` that casts pixel values to the model's dtype.

    Token embeddings are computed from ``input_ids``; when a vision tower and
    ``pixel_values`` are both present, the image embeddings are scattered
    into the placeholder positions before delegating to the parent class's
    ``generate``. ``imgs``, ``imgs_pos`` and ``token_type_ids`` are accepted
    but not used here.

    Raises:
        NotImplementedError: if the caller passes ``inputs_embeds``.
    """
    if "inputs_embeds" in kwargs:
        raise NotImplementedError("`inputs_embeds` is not supported")

    token_embeddings = self.model.embed_tokens(input_ids)

    has_vision_input = self.vit is not None and pixel_values is not None
    if has_vision_input:
        # PATCH: follow the model's own dtype rather than a hard-coded bfloat16.
        vision_features = self.vit(pixel_values.to(self.dtype), image_grid_thw)
        # The ViT may sit on a different device than the LLM (accelerate
        # auto-mapping), so bring its output next to the token ids.
        vision_features = vision_features.to(input_ids.device, non_blocking=True)
        placeholder_mask, _ = self.get_placeholder_mask(
            input_ids,
            inputs_embeds=token_embeddings,
            image_features=vision_features,
        )
        token_embeddings = token_embeddings.masked_scatter(placeholder_mask, vision_features)

    parent_generate = super(HunYuanVLForConditionalGeneration, self).generate
    return parent_generate(
        inputs=input_ids,
        position_ids=position_ids,
        attention_mask=attention_mask,
        inputs_embeds=token_embeddings,
        **kwargs,
    )
+HunYuanVLForConditionalGeneration.generate = patched_generate
class HunyuanOCR:
    """Wrapper class for HunyuanOCR model for text spotting tasks.

    Loads the processor and model once in ``__init__`` and exposes helpers to
    run a text-spotting prompt on a PIL image and to parse the model's
    ``text(x1,y1),(x2,y2)`` output into pixel-space boxes.
    """

    # One detection in the model output: ``text(x1,y1),(x2,y2)``. Group 1 is
    # the text, groups 2-5 are the four integer coordinates.
    _DETECTION_PATTERN = re.compile(r'([^()]+?)\((\d+),(\d+)\),\((\d+),(\d+)\)')
    # Seconds to wait when downloading a remote image, so a stalled server
    # cannot hang the caller indefinitely.
    _DOWNLOAD_TIMEOUT = 30

    def __init__(self, model_path: str = "tencent/HunyuanOCR", device: Optional[str] = None):
        """
        Initialize the HunyuanOCR model
        Args:
            model_path: Path or name of the model (default: "tencent/HunyuanOCR")
            device: Device to load model on (cuda/mps/cpu). Auto-detected if None.
        """
        # Prefer a local "HunyuanOCR" directory over downloading the default model.
        if model_path == "tencent/HunyuanOCR" and os.path.exists("HunyuanOCR"):
            print("Found local HunyuanOCR model, using it instead of downloading...")
            model_path = "HunyuanOCR"
        self.model_path = model_path
        # Auto-detect device if not specified
        if device is None:
            if torch.cuda.is_available():
                self.device = "cuda"
            elif torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"
        else:
            self.device = device
        print(f"Loading HunyuanOCR model on {self.device}...")
        # Load processor
        self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
        # Determine dtype based on device
        if self.device == "cuda":
            torch_dtype = torch.bfloat16
        elif self.device == "mps":
            torch_dtype = torch.float16
        else:
            torch_dtype = torch.float32
        # Load model; device_map="auto" is only used on CUDA, other devices
        # are moved explicitly below.
        self.model = HunYuanVLForConditionalGeneration.from_pretrained(
            model_path,
            attn_implementation="eager",
            torch_dtype=torch_dtype,
            device_map="auto" if self.device == "cuda" else None
        )
        if self.device != "cuda":
            self.model = self.model.to(self.device)
        print("Model loaded successfully!")

    def clean_repeated_substrings(self, text: str) -> str:
        """
        Trim a trailing run of a repeated substring from long model output.

        Text shorter than 8000 characters is returned untouched. Otherwise,
        for each candidate length from 2 up to len(text) // 10, the suffix of
        that length is counted backwards; if it repeats at least 10 times in
        a row, all but one occurrence are removed.

        Args:
            text: Input text to clean
        Returns:
            Cleaned text
        """
        n = len(text)
        if n < 8000:
            return text
        for length in range(2, n // 10 + 1):
            candidate = text[-length:]
            count = 0
            i = n - length
            while i >= 0 and text[i:i + length] == candidate:
                count += 1
                i -= length
            if count >= 10:
                return text[:n - length * (count - 1)]
        return text

    def load_image(self, image_source: str) -> Image.Image:
        """
        Load image from URL or file path
        Args:
            image_source: URL (http:// or https://) or file path to image
        Returns:
            PIL Image object
        Raises:
            requests.HTTPError: if the download returns an error status
            requests.Timeout: if the server does not answer in time
        """
        if image_source.startswith(('http://', 'https://')):
            response = requests.get(image_source, timeout=self._DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            return Image.open(BytesIO(response.content))
        return Image.open(image_source)

    def detect_text(self, image: Image.Image, prompt: Optional[str] = None) -> str:
        """
        Detect and recognize text in image with bounding boxes
        Args:
            image: PIL Image object
            prompt: Custom prompt (default: text spotting prompt in Chinese)
        Returns:
            Model response with detected text and coordinates
        """
        # Default prompt for text spotting
        if prompt is None:
            prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"
        # Prepare messages
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        # Apply chat template
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # Process inputs
        inputs = self.processor(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt",
        )
        # Generate
        with torch.no_grad():
            # Get model's dtype
            model_dtype = next(self.model.parameters()).dtype
            if self.device == "cuda":
                device = next(self.model.parameters()).device
                inputs = inputs.to(device)
            else:
                # Move to device and cast floating point tensors to model's dtype
                new_inputs = {}
                for k, v in inputs.items():
                    if torch.is_tensor(v):
                        v = v.to(self.device)
                        if v.dtype in [torch.float16, torch.bfloat16, torch.float32]:
                            v = v.to(dtype=model_dtype)
                    new_inputs[k] = v
                inputs = new_inputs
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False
            )
        # Decode output: strip the prompt tokens from each generated sequence
        if "input_ids" in inputs:
            input_ids = inputs["input_ids"]
        else:
            input_ids = inputs["inputs"]
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        # Clean repeated substrings
        return self.clean_repeated_substrings(output_text)

    def parse_detection_results(self, response: str, image_width: int, image_height: int) -> List[Dict]:
        """
        Parse detection response into structured format with denormalized coordinates
        Args:
            response: Model output text containing ``text(x1,y1),(x2,y2)`` items
            image_width: Image width in pixels
            image_height: Image height in pixels
        Returns:
            List of dictionaries with 'text', 'x1', 'y1', 'x2', 'y2' keys
        """
        results = []
        for match in self._DETECTION_PATTERN.finditer(response):
            text = match.group(1).strip()
            # Coordinates are normalized to [0, 1000], denormalize them
            x1_norm, y1_norm, x2_norm, y2_norm = (float(v) for v in match.groups()[1:])
            results.append({
                'text': text,
                'x1': int(x1_norm * image_width / 1000),
                'y1': int(y1_norm * image_height / 1000),
                'x2': int(x2_norm * image_width / 1000),
                'y2': int(y2_norm * image_height / 1000)
            })
        return results

    def process_image(self, image_source: str, prompt: Optional[str] = None) -> Tuple[str, List[Dict], Image.Image]:
        """
        Complete pipeline: load image, detect text, parse results
        Args:
            image_source: Path or URL to image
            prompt: Custom prompt for detection
        Returns:
            Tuple of (raw_response, parsed_results, image)
        """
        # Load image
        image = self.load_image(image_source)
        image_width, image_height = image.size
        # Detect text
        response = self.detect_text(image, prompt)
        # Parse results
        parsed_results = self.parse_detection_results(response, image_width, image_height)
        return response, parsed_results, image

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+gradio>=4.0.0
+torch>=2.0.0
+torchvision>=0.15.0
+git+https://github.com/huggingface/transformers@82a06db03535c49aa987719ed0746a76093b1ec4
+Pillow>=10.0.0
+numpy>=1.24.0
+requests>=2.31.0
+openai>=1.0.0
+python-dotenv>=1.0.0

visualization.py ADDED Viewed

	@@ -0,0 +1,385 @@

+"""
+Visualization utilities for drawing text detection boxes on images
+"""
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+from typing import List, Dict, Tuple
+import os
+import math
def generate_random_color() -> Tuple[int, int, int]:
    """
    Pick a random RGB color for a bounding box.

    Red and green are drawn from [0, 200) and blue from [0, 255), using
    NumPy's global random state.

    Returns:
        RGB color tuple
    """
    red = np.random.randint(0, 200)
    green = np.random.randint(0, 200)
    blue = np.random.randint(0, 255)
    return (red, green, blue)
def _find_cjk_font_path(probe_size: int):
    """Return the first candidate font path that loads, or None if none do."""
    here = os.path.dirname(__file__)
    candidates = [
        # Local fonts (project/fonts/) - Prioritize slim/light fonts
        os.path.join(here, "fonts", "NotoSansCJK-Light.ttc"),
        os.path.join(here, "fonts", "NotoSansCJK-Regular.ttc"),
        os.path.join(here, "fonts", "STHeiti-Light.ttc"),
        # macOS fonts
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        # Linux fonts
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Light.ttc",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc"
    ]
    for path in candidates:
        try:
            ImageFont.truetype(path, probe_size)
        except (IOError, OSError):
            continue
        return path
    return None


def _load_font(font_path, size: int):
    """Load ``font_path`` at ``size``; fall back to PIL's default font."""
    if font_path:
        try:
            return ImageFont.truetype(font_path, size)
        except (IOError, OSError):
            pass
    return ImageFont.load_default()


def _wrap_text(draw, text: str, font, max_width):
    """Wrap ``text`` per character to ``max_width``.

    Returns (lines, total_height, widest_line_width); total height uses a
    1.2x line spacing. Empty text yields ([], 0, 0).
    """
    lines = []
    for raw_line in text.split('\n'):
        current_line = ""
        for char in raw_line:
            test_line = current_line + char
            bbox = draw.textbbox((0, 0), test_line, font=font)
            if bbox[2] - bbox[0] < max_width:
                current_line = test_line
            else:
                # Line is full: flush it and start the next one with this char.
                if current_line:
                    lines.append(current_line)
                current_line = char
        if current_line:
            lines.append(current_line)
    if not lines:
        return [], 0, 0
    # Get line height from font metrics
    ascent, descent = font.getmetrics()
    total_height = len(lines) * (ascent + descent) * 1.2  # 1.2 line spacing
    max_line_w = 0
    for line in lines:
        bbox = draw.textbbox((0, 0), line, font=font)
        max_line_w = max(max_line_w, bbox[2] - bbox[0])
    return lines, total_height, max_line_w


def draw_detection_boxes(
    image: Image.Image,
    detections: List[Dict],
    box_width: int = 3,
    font_size: int = 12,
    show_text: bool = True,
    merge_boxes: bool = True
) -> Image.Image:
    """
    Cover each detected text region with a white box and draw its text inside.

    For every detection the largest font size (60 down to 12, step 2) whose
    wrapped text fits the box is chosen; if nothing fits, size 12 is used and
    the box may grow by at most 20% in each dimension.

    Args:
        image: PIL Image to draw on (not modified; a copy is used)
        detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
        box_width: Currently unused; the outline is always 2px wide
        font_size: Size used only when probing for a usable font file
        show_text: Currently unused; text is always drawn
        merge_boxes: Whether to merge close boxes (default: True)
    Returns:
        New RGB image with boxes and text drawn
    """
    # Merge detections if requested
    if merge_boxes:
        detections = merge_detections(detections)
    # Work on an RGBA copy so the caller's image is left untouched
    img_draw = image.copy().convert('RGBA')
    draw = ImageDraw.Draw(img_draw)
    font_path = _find_cjk_font_path(font_size)
    # Fonts are reused across detections instead of being reloaded from disk
    # for every size tried on every box.
    font_cache = {}

    def font_at(size):
        if size not in font_cache:
            font_cache[size] = _load_font(font_path, size)
        return font_cache[size]

    min_font_size = 12
    max_font_size = 60
    for detection in detections:
        try:
            text = detection['text']
            x1, y1 = detection['x1'], detection['y1']
            x2, y2 = detection['x2'], detection['y2']
            box_w = x2 - x1
            box_h = y2 - y1
            # Largest size whose layout fits inside the original box
            best_font_size = min_font_size
            for size in range(max_font_size, min_font_size - 1, -2):
                _, total_h, max_w = _wrap_text(draw, text, font_at(size), box_w - 8)  # 8px padding
                if total_h <= box_h - 4 and max_w <= box_w - 4:
                    best_font_size = size
                    break
            font_to_use = font_at(best_font_size)
            # Calculate max allowed dimensions (max 20% larger)
            max_allowed_w = int(box_w * 1.2)
            max_allowed_h = int(box_h * 1.2)
            # Lay out against the widest allowed box to minimize height;
            # -8 leaves 4px padding on the left and right.
            lines, total_h, max_line_w = _wrap_text(draw, text, font_to_use, max_allowed_w - 8)
            # Grow the box if needed, capped at 20%, never below original size
            box_w = max(box_w, min(max_line_w + 8, max_allowed_w))
            box_h = max(box_h, min(total_h + 4, max_allowed_h))
            x2 = x1 + box_w
            y2 = y1 + box_h
            # Cover original text with white background (possibly expanded box)
            draw.rectangle(
                [x1, y1, x2, y2],
                fill=(255, 255, 255),
                outline=(0, 0, 0),
                width=2
            )
            # Draw text left-aligned horizontally and centered vertically
            ascent, descent = font_to_use.getmetrics()
            line_height = (ascent + descent) * 1.2
            start_y = y1 + (box_h - total_h) / 2
            for j, line in enumerate(lines):
                # Left align with small padding
                draw.text((x1 + 4, start_y + j * line_height), line, font=font_to_use, fill=(0, 0, 0))
        except Exception as e:
            # Best-effort rendering: one bad detection must not lose the rest.
            print(f"Error drawing detection box: {str(e)}")
            continue
    # Convert back to RGB
    return img_draw.convert('RGB')
def create_side_by_side_comparison(
    original: Image.Image,
    annotated: Image.Image,
    spacing: int = 20
) -> Image.Image:
    """
    Place the original and annotated images next to each other on a white canvas.

    Args:
        original: Original image (pasted on the left)
        annotated: Annotated image with boxes (pasted on the right)
        spacing: Space between images in pixels
    Returns:
        Combined RGB image showing both versions, each with a caption
    """
    left_w, left_h = original.size
    right_w, right_h = annotated.size
    right_x = left_w + spacing

    # The canvas is as wide as both images plus the gap and as tall as the taller one.
    canvas = Image.new(
        'RGB',
        (left_w + right_w + spacing, max(left_h, right_h)),
        (255, 255, 255),
    )
    canvas.paste(original, (0, 0))
    canvas.paste(annotated, (right_x, 0))

    pen = ImageDraw.Draw(canvas)

    # Candidate caption fonts, tried in order; the first that loads is used.
    candidate_fonts = (
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/Hiragino Sans GB.ttc",
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    label_font = None
    for candidate in candidate_fonts:
        try:
            label_font = ImageFont.truetype(candidate, 24)
        except (IOError, OSError):
            continue
        break
    if label_font is None:
        label_font = ImageFont.load_default()

    captions = ((10, "Original"), (right_x + 10, "Detected Text"))
    for caption_x, caption in captions:
        pen.text((caption_x, 10), caption, font=label_font, fill=(0, 0, 0))
    return canvas
def get_detection_summary(detections: List[Dict]) -> str:
    """
    Render detections as a numbered, human-readable report.

    Entries that carry an 'original_text' different from 'text' are shown as
    an Original/Translated pair; all others show just the text. Every entry
    ends with its box corners.

    Args:
        detections: List of detection dictionaries
    Returns:
        Formatted summary string
    """
    if not detections:
        return "No text detected in the image."

    pieces = [f"Detected {len(detections)} text region(s):\n\n"]
    for index, det in enumerate(detections, 1):
        was_translated = 'original_text' in det and det['original_text'] != det['text']
        if was_translated:
            pieces.append(f"{index}. Original: \"{det['original_text']}\"\n")
            pieces.append(f"   Translated: \"{det['text']}\"\n")
        else:
            pieces.append(f"{index}. \"{det['text']}\"\n")
        pieces.append(f"   Location: ({det['x1']}, {det['y1']}) → ({det['x2']}, {det['y2']})\n\n")
    return "".join(pieces)
def merge_detections(detections: List[Dict], threshold: int = 50) -> List[Dict]:
    """
    Group detections whose boxes lie within ``threshold`` pixels of each
    other (transitively) and collapse each group into one detection.

    The merged box is the bounding box of the group. The merged text joins
    the members ordered right-to-left, then top-to-bottom, with all spaces
    removed.

    Args:
        detections: List of detection dicts
        threshold: Distance threshold for merging
    Returns:
        List of merged detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
    """
    if not detections:
        return []

    def within_reach(a, b, margin):
        # Grow `a` by `margin` on every side, then test for overlap with `b`.
        left, top = a['x1'] - margin, a['y1'] - margin
        right, bottom = a['x2'] + margin, a['y2'] + margin
        return (right >= b['x1'] and left <= b['x2'] and
                bottom >= b['y1'] and top <= b['y2'])

    total = len(detections)

    # Undirected proximity graph: neighbours[k] lists the boxes close to box k.
    neighbours = [[] for _ in range(total)]
    for first in range(total):
        for second in range(first + 1, total):
            if within_reach(detections[first], detections[second], threshold):
                neighbours[first].append(second)
                neighbours[second].append(first)

    seen = [False] * total
    merged = []
    for start in range(total):
        if seen[start]:
            continue
        # Depth-first walk (explicit stack) collecting one connected group.
        seen[start] = True
        pending = [start]
        group = []
        while pending:
            node = pending.pop()
            group.append(detections[node])
            for other in neighbours[node]:
                if seen[other]:
                    continue
                seen[other] = True
                pending.append(other)

        bounds = {
            'x1': min(d['x1'] for d in group),
            'y1': min(d['y1'] for d in group),
            'x2': max(d['x2'] for d in group),
            'y2': max(d['y2'] for d in group),
        }
        # Manga reading order: right-to-left (descending x), then top-to-bottom.
        group.sort(key=lambda d: (-d['x1'], d['y1']))
        joined = "".join(d['text'] for d in group).replace(" ", "")
        merged.append({
            'text': joined,
            'x1': bounds['x1'],
            'y1': bounds['y1'],
            'x2': bounds['x2'],
            'y2': bounds['y2']
        })
    return merged