jzhang533 committed
Commit 5b3defa · Parent: 0a9cd06

ai manga translator


Signed-off-by: Zhang Jun <[email protected]>

.env.example ADDED
@@ -0,0 +1,3 @@
+ MODEL_ACCESS_TOKEN=your_token_here
+ MODEL_API_URL=your_api_url_here
+ MODEL_NAME=your_model_name_here
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ examples/dandadan.png filter=lfs diff=lfs merge=lfs -text
+ examples/ruridragon.png filter=lfs diff=lfs merge=lfs -text
+ examples/spyfamily.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
+ __pycache__/
+ *.pyc
+ .DS_Store
+ venv/
+ .venv/
+ .ipynb_checkpoints/
+
+ # Environment variables
+ .env
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
README.md CHANGED
@@ -10,4 +10,23 @@ pinned: false
  license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 📚 AI Manga Translator
+
+ An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics.
+
+ **Key Capabilities:**
+ - 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays the translated text.
+ - 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
+ - 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
+
+ ## Technologies
+ - **OCR Engine**: HunyuanOCR
+ - **Translation**: ERNIE 4.5 (via API)
+ - **Development**: Vibe coded with Gemini 3 Pro
+
+ ## Setup
+ To run this locally:
+ 1. Install dependencies: `pip install -r requirements.txt`
+ 2. Set up `.env` (copy `.env.example` and fill in your model credentials).
+ 3. Run `python app.py`.
+
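For orientation (this note and the sketch are editorial, not part of the commit), the three variables added in `.env.example` are consumed at runtime through `python-dotenv` and an OpenAI-compatible client, mirroring the `translate_text()` helper introduced in `app.py` below; the token, URL, and model name are placeholders you must supply.

```python
# Minimal sketch, assuming a valid OpenAI-compatible endpoint is configured in .env
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()  # reads MODEL_ACCESS_TOKEN, MODEL_API_URL, MODEL_NAME from .env

client = OpenAI(
    api_key=os.getenv("MODEL_ACCESS_TOKEN"),
    base_url=os.getenv("MODEL_API_URL"),
)

reply = client.chat.completions.create(
    model=os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k"),  # same fallback as app.py
    messages=[{"role": "user", "content": "Translate to English: こんにちは"}],
)
print(reply.choices[0].message.content)
```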
app.py ADDED
@@ -0,0 +1,233 @@
+ """
+ Gradio Web UI for HunyuanOCR Text Spotting
+ Upload an image and get text detection with bounding boxes
+ """
+ import gradio as gr
+ from PIL import Image
+ import os
+ from ocr_model import HunyuanOCR
+ from visualization import draw_detection_boxes, get_detection_summary
+ from dotenv import load_dotenv
+ from openai import OpenAI
+
+ # Load environment variables
+ load_dotenv()
+
+ # Global model instance (loaded once)
+ ocr_model = None
+
+
+ def initialize_model():
+     """Initialize the OCR model (called once at startup)"""
+     global ocr_model
+     if ocr_model is None:
+         print("Initializing HunyuanOCR model...")
+         ocr_model = HunyuanOCR()
+         print("Model ready!")
+     return ocr_model
+
+
+ def process_image(image: Image.Image, prompt: str = None, target_language: str = "Chinese"):
+     """
+     Process uploaded image and return annotated result
+
+     Args:
+         image: PIL Image from Gradio
+         prompt: Optional custom prompt
+         target_language: Target language for translation (Original, Chinese, English, French, etc.)
+
+     Returns:
+         Tuple of (annotated_image, detection_summary, raw_response)
+     """
+     if image is None:
+         return None, "Please upload an image first.", ""
+
+     try:
+         # Initialize model if needed
+         model = initialize_model()
+
+         # Resize image if height > 960 while maintaining aspect ratio
+         if image.height > 960:
+             aspect_ratio = image.width / image.height
+             new_height = 960
+             new_width = int(new_height * aspect_ratio)
+             print(f"Resizing image from {image.size} to ({new_width}, {new_height})")
+             image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+         # Get image dimensions
+         image_width, image_height = image.size
+
+         # Use default prompt if not provided
+         if not prompt or prompt.strip() == "":
+             prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"
+
+         # Detect text
+         print("Running text detection...")
+         response = model.detect_text(image, prompt)
+
+         # Parse results
+         detections = model.parse_detection_results(response, image_width, image_height)
+
+         # Merge detections first (since visualization does it internally, we need to do it here for translation)
+         from visualization import merge_detections
+         merged_detections = merge_detections(detections)
+
+         # Translate text in merged detections if not "Original"
+         if target_language != "Original":
+             print(f"Translating text to {target_language}...")
+             for det in merged_detections:
+                 original_text = det['text']
+                 translated = translate_text(original_text, target_language)
+                 det['original_text'] = original_text
+                 det['text'] = translated
+                 print(f"Translated: {original_text[:20]}... -> {translated[:20]}...")
+         else:
+             print("Skipping translation (Original selected)")
+
+         # Draw boxes on image (pass merged detections and disable internal merging)
+         annotated_image = draw_detection_boxes(image, merged_detections, merge_boxes=False)
+
+         # Create summary
+         summary = get_detection_summary(merged_detections)
+
+         print(f"Detected {len(detections)} text regions")
+
+         return annotated_image, summary, response
+
+     except Exception as e:
+         error_msg = f"Error processing image: {str(e)}"
+         print(error_msg)
+         return None, error_msg, ""
+
+
+ def translate_text(text: str, target_language: str = "Chinese") -> str:
+     """
+     Translate text to target language using model specified in .env via OpenAI-compatible API
+     """
+     try:
+         api_key = os.getenv("MODEL_ACCESS_TOKEN")
+         base_url = os.getenv("MODEL_API_URL")
+         model_name = os.getenv("MODEL_NAME", "ernie-4.5-turbo-128k")  # Default fallback
+
+         if not api_key or not base_url:
+             print("Warning: MODEL_ACCESS_TOKEN or MODEL_API_URL not found in .env")
+             return text
+
+         client = OpenAI(api_key=api_key, base_url=base_url)
+
+         system_prompt = f"You are a professional manga translator. The following text is from a Japanese manga. Translate it into natural and expressive {target_language}, maintaining the character's tone and the context of the scene. Only output the translation, no explanations."
+
+         response = client.chat.completions.create(
+             model=model_name,
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": text}
+             ]
+         )
+
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         print(f"Translation error: {e}")
+         return text
+
+
+ def create_demo():
+     """Create and configure the Gradio interface"""
+
+     with gr.Blocks(title="AI Manga Translator") as demo:
+         gr.Markdown("""
+         # 📚 AI Manga Translator
+
+         An intelligent tool designed to detect, recognize, and translate text in images, with specialized features for Manga and Comics.
+
+         **Key Capabilities:**
+         - 🖌️ **Smart Text Replacement**: Automatically detects text bubbles, wipes them clean, and overlays translated text.
+         - 📖 **Manga-Optimized**: Handles vertical text and right-to-left reading order correctly.
+         - 🌏 **Multi-Language Translation**: Translates detected text into your preferred language (Chinese, English, French, etc.).
+         - 🔍 **High-Precision OCR**: Accurately spots text even in complex backgrounds.
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # Input section
+                 gr.Markdown("### 📤 Input")
+                 input_image = gr.Image(
+                     type="pil",
+                     label="Upload Image",
+                     sources=["upload", "clipboard"]
+                 )
+
+                 custom_prompt = gr.Textbox(
+                     label="Custom Prompt (Optional)",
+                     placeholder="检测并识别图片中的文字,将文本内容与坐标格式化输出。",
+                     lines=2
+                 )
+
+                 target_lang = gr.Dropdown(
+                     choices=["Original", "Chinese", "English", "French", "German", "Spanish", "Korean", "Japanese"],
+                     value="Chinese",
+                     label="Target Language",
+                     info="Select language for translation (Original = no translation)"
+                 )
+
+                 detect_btn = gr.Button("🔍 Detect & Translate", variant="primary", size="lg")
+
+             with gr.Column(scale=1):
+                 # Output section
+                 gr.Markdown("### 📊 Results")
+                 output_image = gr.Image(
+                     type="pil",
+                     label="Detected Text with Bounding Boxes"
+                 )
+
+                 detection_summary = gr.Textbox(
+                     label="Detection Summary",
+                     lines=10,
+                     max_lines=20
+                 )
+
+         # Connect the button
+         detect_btn.click(
+             fn=process_image,
+             inputs=[input_image, custom_prompt, target_lang],
+             outputs=[output_image, detection_summary]
+         )
+
+         # Examples
+         gr.Markdown("### 📝 Examples")
+         gr.Examples(
+             examples=[
+                 ["examples/dandadan.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
+                 ["examples/ruridragon.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
+                 ["examples/spyfamily.png", "检测并识别图片中的文字,将文本内容与坐标格式化输出。"],
+             ],
+             inputs=[input_image, custom_prompt],
+             label="Click to use example image"
+         )
+
+         gr.Markdown("""
+         ---
+         ### ℹ️ About
+
+         This application combines state-of-the-art AI technologies to provide seamless manga translation:
+
+         - **OCR Engine**: HunyuanOCR.
+         - **Translation**: Powered by **ERNIE 4.5** for natural and context-aware translations.
+         - **Development**: Vibe coded with **Gemini 3 Pro**.
+         """)
+
+     return demo
+
+
+ if __name__ == "__main__":
+     # Create and launch the demo
+     print("Loading model (this may take a minute on first run)...")
+
+     demo = create_demo()
+
+     # Launch with public link option
+     demo.launch(
+         server_name="127.0.0.1",
+         share=False,  # Set to True to create a public link
+         show_error=True
+     )
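The Gradio UI above is a thin layer over `process_image`; a minimal headless sketch (editorial, assuming the bundled example images and a configured `.env`) would be:

```python
# Minimal sketch: run the pipeline from app.py without launching the Gradio UI.
from PIL import Image
from app import process_image

image = Image.open("examples/dandadan.png")
annotated, summary, raw_response = process_image(image, target_language="English")

if annotated is not None:  # process_image returns (None, error_msg, "") on failure
    annotated.save("dandadan_translated.png")
print(summary)
```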
examples/dandadan.png ADDED

Git LFS Details

  • SHA256: 7aba7ec46d88f0f0516a3602de3b06dddb2662db26ee4253937fd7fc4f5cda27
  • Pointer size: 131 Bytes
  • Size of remote file: 664 kB
examples/ruridragon.png ADDED

Git LFS Details

  • SHA256: f1f714f8d553328e6716bea94b80aaa7c07bbf9e4e1d240ed1dff92b51a40622
  • Pointer size: 131 Bytes
  • Size of remote file: 322 kB
examples/spyfamily.png ADDED

Git LFS Details

  • SHA256: 91a5e11a466f36b9ced68b6bcd95e28e64a334234c1088d59e504e4c9fdc0bcc
  • Pointer size: 131 Bytes
  • Size of remote file: 236 kB
ocr_model.py ADDED
@@ -0,0 +1,319 @@
+ """
+ HunyuanOCR Model Wrapper
+ Provides an easy-to-use interface for text detection and recognition
+ """
+ import re
+ import os
+ import torch
+ from typing import Dict, List, Tuple, Optional
+ from PIL import Image
+ from transformers import AutoProcessor, HunYuanVLForConditionalGeneration
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+ import requests
+ from io import BytesIO
+
+ # Monkey-patch HunYuanVLForConditionalGeneration.generate to fix dtype issue
+ def patched_generate(
+     self,
+     input_ids: Optional[torch.LongTensor] = None,
+     attention_mask: Optional[torch.Tensor] = None,
+     position_ids: Optional[torch.LongTensor] = None,
+     imgs: Optional[list[torch.FloatTensor]] = None,
+     imgs_pos: Optional[list[int]] = None,
+     token_type_ids: Optional[torch.LongTensor] = None,
+     pixel_values: Optional[torch.FloatTensor] = None,
+     image_grid_thw: Optional[list[int]] = None,
+     **kwargs,
+ ) -> CausalLMOutputWithPast:
+     if "inputs_embeds" in kwargs:
+         raise NotImplementedError("`inputs_embeds` is not supported")
+
+     inputs_embeds = self.model.embed_tokens(input_ids)
+
+     if self.vit is not None and pixel_values is not None:
+         # PATCH: Use model's dtype instead of forcing bfloat16
+         pixel_values = pixel_values.to(self.dtype)
+         image_embeds = self.vit(pixel_values, image_grid_thw)
+
+         # ViT may be deployed on different GPUs from those used by LLMs, due to auto-mapping of accelerate.
+         image_embeds = image_embeds.to(input_ids.device, non_blocking=True)
+
+         image_mask, _ = self.get_placeholder_mask(
+             input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+         )
+         inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
+     return super(HunYuanVLForConditionalGeneration, self).generate(
+         inputs=input_ids,
+         position_ids=position_ids,
+         attention_mask=attention_mask,
+         inputs_embeds=inputs_embeds,
+         **kwargs,
+     )
+
+ HunYuanVLForConditionalGeneration.generate = patched_generate
+
+
+ class HunyuanOCR:
+     """Wrapper class for HunyuanOCR model for text spotting tasks"""
+
+     def __init__(self, model_path: str = "tencent/HunyuanOCR", device: Optional[str] = None):
+         """
+         Initialize the HunyuanOCR model
+
+         Args:
+             model_path: Path or name of the model (default: "tencent/HunyuanOCR")
+             device: Device to load model on (cuda/cpu). Auto-detected if None.
+         """
+         # Check if local model exists when using default path
+         if model_path == "tencent/HunyuanOCR" and os.path.exists("HunyuanOCR"):
+             print("Found local HunyuanOCR model, using it instead of downloading...")
+             model_path = "HunyuanOCR"
+
+         self.model_path = model_path
+
+         # Auto-detect device if not specified
+         if device is None:
+             if torch.cuda.is_available():
+                 self.device = "cuda"
+             elif torch.backends.mps.is_available():
+                 self.device = "mps"
+             else:
+                 self.device = "cpu"
+         else:
+             self.device = device
+
+         print(f"Loading HunyuanOCR model on {self.device}...")
+
+         # Load processor
+         self.processor = AutoProcessor.from_pretrained(model_path, use_fast=False)
+
+         # Determine dtype based on device
+         if self.device == "cuda":
+             torch_dtype = torch.bfloat16
+         elif self.device == "mps":
+             torch_dtype = torch.float16
+         else:
+             torch_dtype = torch.float32
+
+         # Load model
+         self.model = HunYuanVLForConditionalGeneration.from_pretrained(
+             model_path,
+             attn_implementation="eager",
+             torch_dtype=torch_dtype,
+             device_map="auto" if self.device == "cuda" else None
+         )
+
+         if self.device != "cuda":
+             self.model = self.model.to(self.device)
+
+         print("Model loaded successfully!")
+
+     def clean_repeated_substrings(self, text: str) -> str:
+         """
+         Clean repeated substrings in text output
+
+         Args:
+             text: Input text to clean
+
+         Returns:
+             Cleaned text
+         """
+         n = len(text)
+         if n < 8000:
+             return text
+
+         for length in range(2, n // 10 + 1):
+             candidate = text[-length:]
+             count = 0
+             i = n - length
+
+             while i >= 0 and text[i:i + length] == candidate:
+                 count += 1
+                 i -= length
+
+             if count >= 10:
+                 return text[:n - length * (count - 1)]
+
+         return text
+
+     def load_image(self, image_source: str) -> Image.Image:
+         """
+         Load image from URL or file path
+
+         Args:
+             image_source: URL or file path to image
+
+         Returns:
+             PIL Image object
+         """
+         if image_source.startswith(('http://', 'https://')):
+             response = requests.get(image_source)
+             response.raise_for_status()
+             return Image.open(BytesIO(response.content))
+         else:
+             return Image.open(image_source)
+
+     def detect_text(self, image: Image.Image, prompt: Optional[str] = None) -> str:
+         """
+         Detect and recognize text in image with bounding boxes
+
+         Args:
+             image: PIL Image object
+             prompt: Custom prompt (default: text spotting prompt in Chinese)
+
+         Returns:
+             Model response with detected text and coordinates
+         """
+         # Default prompt for text spotting
+         if prompt is None:
+             prompt = "检测并识别图片中的文字,将文本内容与坐标格式化输出。"
+
+         # Prepare messages
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image"},
+                     {"type": "text", "text": prompt},
+                 ],
+             }
+         ]
+
+         # Apply chat template
+         text = self.processor.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+
+         # Process inputs
+         inputs = self.processor(
+             text=[text],
+             images=[image],
+             padding=True,
+             return_tensors="pt",
+         )
+
+         # Generate
+         with torch.no_grad():
+             # Get model's dtype
+             model_dtype = next(self.model.parameters()).dtype
+
+             if self.device == "cuda":
+                 device = next(self.model.parameters()).device
+                 inputs = inputs.to(device)
+             else:
+                 # Move to device and cast floating point tensors to model's dtype
+                 new_inputs = {}
+                 for k, v in inputs.items():
+                     if torch.is_tensor(v):
+                         v = v.to(self.device)
+                         if v.dtype in [torch.float16, torch.bfloat16, torch.float32]:
+                             v = v.to(dtype=model_dtype)
+                         new_inputs[k] = v
+                     else:
+                         new_inputs[k] = v
+                 inputs = new_inputs
+
+             generated_ids = self.model.generate(
+                 **inputs,
+                 max_new_tokens=2048,
+                 do_sample=False
+             )
+
+         # Decode output
+         if "input_ids" in inputs:
+             input_ids = inputs["input_ids"]
+         else:
+             input_ids = inputs["inputs"]
+
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):]
+             for in_ids, out_ids in zip(input_ids, generated_ids)
+         ]
+
+         output_text = self.processor.batch_decode(
+             generated_ids_trimmed,
+             skip_special_tokens=True,
+             clean_up_tokenization_spaces=False
+         )[0]
+
+         # Clean repeated substrings
+         output_text = self.clean_repeated_substrings(output_text)
+
+         return output_text
+
+     def parse_detection_results(self, response: str, image_width: int, image_height: int) -> List[Dict]:
+         """
+         Parse detection response into structured format with denormalized coordinates
+
+         Args:
+             response: Model output text
+             image_width: Image width in pixels
+             image_height: Image height in pixels
+
+         Returns:
+             List of dictionaries with 'text', 'x1', 'y1', 'x2', 'y2' keys
+         """
+         results = []
+
+         # Pattern to match text and coordinates: text(x1,y1),(x2,y2)
+         pattern = r'([^()]+?)(\(\d+,\d+\),\(\d+,\d+\))'
+         matches = re.finditer(pattern, response)
+
+         for match in matches:
+             try:
+                 text = match.group(1).strip()
+                 coords = match.group(2)
+
+                 # Parse coordinates
+                 coord_pattern = r'\((\d+),(\d+)\)'
+                 coord_matches = re.findall(coord_pattern, coords)
+
+                 if len(coord_matches) == 2:
+                     # Coordinates are normalized to [0, 1000], denormalize them
+                     x1_norm, y1_norm = float(coord_matches[0][0]), float(coord_matches[0][1])
+                     x2_norm, y2_norm = float(coord_matches[1][0]), float(coord_matches[1][1])
+
+                     # Denormalize to image dimensions
+                     x1 = int(x1_norm * image_width / 1000)
+                     y1 = int(y1_norm * image_height / 1000)
+                     x2 = int(x2_norm * image_width / 1000)
+                     y2 = int(y2_norm * image_height / 1000)
+
+                     results.append({
+                         'text': text,
+                         'x1': x1,
+                         'y1': y1,
+                         'x2': x2,
+                         'y2': y2
+                     })
+             except Exception as e:
+                 print(f"Error parsing detection result: {str(e)}")
+                 continue
+
+         return results
+
+ def process_image(self, image_source: str, prompt: Optional[str] = None) -> Tuple[str, List[Dict]]:
299
+ """
300
+ Complete pipeline: load image, detect text, parse results
301
+
302
+ Args:
303
+ image_source: Path or URL to image
304
+ prompt: Custom prompt for detection
305
+
306
+ Returns:
307
+ Tuple of (raw_response, parsed_results)
308
+ """
309
+ # Load image
310
+ image = self.load_image(image_source)
311
+ image_width, image_height = image.size
312
+
313
+ # Detect text
314
+ response = self.detect_text(image, prompt)
315
+
316
+ # Parse results
317
+ parsed_results = self.parse_detection_results(response, image_width, image_height)
318
+
319
+ return response, parsed_results, image
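As a quick illustration of the wrapper's API (an editorial sketch, assuming the model weights can be loaded on the local CUDA/MPS/CPU device), `process_image` returns the raw model response, the parsed detections in pixel coordinates, and the loaded image:

```python
# Minimal sketch: standalone use of the HunyuanOCR wrapper defined above.
from ocr_model import HunyuanOCR

ocr = HunyuanOCR()  # uses a local ./HunyuanOCR folder if present, else tencent/HunyuanOCR
response, detections, image = ocr.process_image("examples/spyfamily.png")

# The raw response carries coordinates normalized to [0, 1000]; parse_detection_results
# rescales them to pixels, e.g. x1 = int(x1_norm * image_width / 1000).
for det in detections:
    print(det['text'], (det['x1'], det['y1']), (det['x2'], det['y2']))
```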
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=4.0.0
+ torch>=2.0.0
+ torchvision>=0.15.0
+ git+https://github.com/huggingface/transformers@82a06db03535c49aa987719ed0746a76093b1ec4
+ Pillow>=10.0.0
+ numpy>=1.24.0
+ requests>=2.31.0
+ openai>=1.0.0
+ python-dotenv>=1.0.0
visualization.py ADDED
@@ -0,0 +1,385 @@
+ """
+ Visualization utilities for drawing text detection boxes on images
+ """
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ from typing import List, Dict, Tuple
+ import os
+ import math
+
+ def generate_random_color() -> Tuple[int, int, int]:
+     """
+     Generate a random color for bounding boxes
+
+     Returns:
+         RGB color tuple
+     """
+     return (
+         np.random.randint(0, 200),
+         np.random.randint(0, 200),
+         np.random.randint(0, 255)
+     )
+
+
+ def draw_detection_boxes(
+     image: Image.Image,
+     detections: List[Dict],
+     box_width: int = 3,
+     font_size: int = 12,
+     show_text: bool = True,
+     merge_boxes: bool = True
+ ) -> Image.Image:
+     """
+     Draw text detection boxes with labels on image
+
+     Args:
+         image: PIL Image to draw on
+         detections: List of detection dicts with 'text', 'x1', 'y1', 'x2', 'y2'
+         box_width: Width of bounding box lines
+         font_size: Font size for text labels
+         show_text: Whether to show text labels
+         merge_boxes: Whether to merge close boxes (default: True)
+
+     Returns:
+         New image with boxes and labels drawn
+     """
+     # Merge detections if requested
+     if merge_boxes:
+         detections = merge_detections(detections)
+
+     # Create a copy of the image
+     img_draw = image.copy().convert('RGBA')
+
+     # Create transparent overlay for semi-transparent boxes
+     overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
+     draw_overlay = ImageDraw.Draw(overlay)
+     draw = ImageDraw.Draw(img_draw)
+
+     # Try to load a better font that supports CJK (Chinese/Japanese/Korean)
+     # Prioritize local fonts folder for portability
+     font_paths = [
+         # Local fonts (project/fonts/) - Prioritize slim/light fonts
+         os.path.join(os.path.dirname(__file__), "fonts", "NotoSansCJK-Light.ttc"),
+         os.path.join(os.path.dirname(__file__), "fonts", "NotoSansCJK-Regular.ttc"),
+         os.path.join(os.path.dirname(__file__), "fonts", "STHeiti-Light.ttc"),
+         # macOS fonts
+         "/System/Library/Fonts/STHeiti Light.ttc",
+         "/System/Library/Fonts/PingFang.ttc",
+         "/System/Library/Fonts/Hiragino Sans GB.ttc",
+         # Linux fonts
+         "/usr/share/fonts/truetype/noto/NotoSansCJK-Light.ttc",
+         "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
+         "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc"
+     ]
+
+     font = None
+     valid_font_path = None
+     for path in font_paths:
+         try:
+             font = ImageFont.truetype(path, font_size)
+             valid_font_path = path
+             break
+         except (IOError, OSError):
+             continue
+
+     if font is None:
+         # Fallback to default if no custom font loaded
+         font = ImageFont.load_default()
+
+     # Draw each detection
+     for i, detection in enumerate(detections, 1):
+         try:
+             text = detection['text']
+             x1, y1 = detection['x1'], detection['y1']
+             x2, y2 = detection['x2'], detection['y2']
+
+             # Calculate box dimensions
+             box_w = x2 - x1
+             box_h = y2 - y1
+
+             # Helper function to wrap text and calculate size
+             def get_text_layout(text, font, max_width):
+                 lines = []
+                 raw_lines = text.split('\n')
+                 for raw_line in raw_lines:
+                     current_line = ""
+                     for char in raw_line:
+                         test_line = current_line + char
+                         bbox = draw.textbbox((0, 0), test_line, font=font)
+                         if bbox[2] - bbox[0] < max_width:
+                             current_line = test_line
+                         else:
+                             if current_line:
+                                 lines.append(current_line)
+                             current_line = char
+                     if current_line:
+                         lines.append(current_line)
+
+                 # Calculate total height
+                 if not lines:
+                     return [], 0, 0
+
+                 # Get line height from font metrics
+                 ascent, descent = font.getmetrics()
+                 line_height = ascent + descent
+                 total_height = len(lines) * line_height * 1.2  # 1.2 line spacing
+
+                 max_line_w = 0
+                 for line in lines:
+                     bbox = draw.textbbox((0, 0), line, font=font)
+                     max_line_w = max(max_line_w, bbox[2] - bbox[0])
+
+                 return lines, total_height, max_line_w
+
+             # Iteratively find best font size
+             min_font_size = 12
+             max_font_size = 60
+             best_font_size = min_font_size
+             best_lines = []
+
+             # Try sizes from max to min
+             for size in range(max_font_size, min_font_size - 1, -2):
+                 try:
+                     if valid_font_path:
+                         test_font = ImageFont.truetype(valid_font_path, size)
+                     else:
+                         test_font = ImageFont.load_default()
+                 except:
+                     test_font = ImageFont.load_default()
+
+                 lines, total_h, max_w = get_text_layout(text, test_font, box_w - 8)  # 8px padding
+
+                 if total_h <= box_h - 4 and max_w <= box_w - 4:
+                     best_font_size = size
+                     best_lines = lines
+                     break
+
+             # If even min size doesn't fit, calculate required box expansion
+             font_size_to_use = best_font_size
+             try:
+                 if valid_font_path:
+                     font_to_use = ImageFont.truetype(valid_font_path, font_size_to_use)
+                 else:
+                     font_to_use = ImageFont.load_default()
+             except:
+                 font_to_use = ImageFont.load_default()
+
+             # Calculate max allowed dimensions (max 20% larger)
+             max_allowed_w = int(box_w * 1.2)
+             max_allowed_h = int(box_h * 1.2)
+
+             # Try layout with max allowed width to minimize height
+             # Use -8 for padding (4px left, 4px right)
+             lines, total_h, max_line_w = get_text_layout(text, font_to_use, max_allowed_w - 8)
+
+             # Determine new dimensions, capped at 20% expansion
+             # We ensure we don't shrink below original size
+             new_w = max(box_w, min(max_line_w + 8, max_allowed_w))
+             new_h = max(box_h, min(total_h + 4, max_allowed_h))
+
+             # Update box coordinates
+             x2 = x1 + new_w
+             y2 = y1 + new_h
+             box_w = new_w
+             box_h = new_h
+
+             # 1. Cover original text with white background (using potentially expanded box)
+             draw.rectangle(
+                 [x1, y1, x2, y2],
+                 fill=(255, 255, 255),
+                 outline=(0, 0, 0),
+                 width=2
+             )
+
+             # 2. Draw text left-aligned horizontally and centered vertically
+             # Get metrics again for drawing
+             ascent, descent = font_to_use.getmetrics()
+             line_height = (ascent + descent) * 1.2
+
+             start_y = y1 + (box_h - total_h) / 2
+
+             for j, line in enumerate(lines):
+                 # Left align with small padding
+                 line_x = x1 + 4
+                 line_y = start_y + j * line_height
+
+                 draw.text((line_x, line_y), line, font=font_to_use, fill=(0, 0, 0))
+
+         except Exception as e:
+             print(f"Error drawing detection box: {str(e)}")
+             continue
+
+         except Exception as e:
+             print(f"Error drawing detection box: {str(e)}")
+             continue
+
+     # Composite the overlay onto the image
+     img_draw.paste(overlay, (0, 0), overlay)
+
+     # Convert back to RGB
+     return img_draw.convert('RGB')
+
+
+ def create_side_by_side_comparison(
+     original: Image.Image,
+     annotated: Image.Image,
+     spacing: int = 20
+ ) -> Image.Image:
+     """
+     Create side-by-side comparison of original and annotated images
+
+     Args:
+         original: Original image
+         annotated: Annotated image with boxes
+         spacing: Space between images in pixels
+
+     Returns:
+         Combined image showing both versions
+     """
+     # Get dimensions
+     width1, height1 = original.size
+     width2, height2 = annotated.size
+
+     # Create new image
+     total_width = width1 + width2 + spacing
+     total_height = max(height1, height2)
+
+     combined = Image.new('RGB', (total_width, total_height), (255, 255, 255))
+
+     # Paste images
+     combined.paste(original, (0, 0))
+     combined.paste(annotated, (width1 + spacing, 0))
+
+     # Add labels
+     draw = ImageDraw.Draw(combined)
+
+     # Try to load a better font that supports CJK
+     font_paths = [
+         "/System/Library/Fonts/PingFang.ttc",
+         "/System/Library/Fonts/Hiragino Sans GB.ttc",
+         "/System/Library/Fonts/STHeiti Light.ttc",
+         "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
+         "/System/Library/Fonts/Supplemental/Arial.ttf",
+         "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
+         "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
+     ]
+
+     font = None
+     for path in font_paths:
+         try:
+             font = ImageFont.truetype(path, 24)
+             break
+         except (IOError, OSError):
+             continue
+
+     if font is None:
+         font = ImageFont.load_default()
+
+     draw.text((10, 10), "Original", font=font, fill=(0, 0, 0))
+     draw.text((width1 + spacing + 10, 10), "Detected Text", font=font, fill=(0, 0, 0))
+
+     return combined
+
+
+ def get_detection_summary(detections: List[Dict]) -> str:
+     """
+     Create a text summary of detection results
+
+     Args:
+         detections: List of detection dictionaries
+
+     Returns:
+         Formatted summary string
+     """
+     if not detections:
+         return "No text detected in the image."
+
+     summary = f"Detected {len(detections)} text region(s):\n\n"
+
+     for i, det in enumerate(detections, 1):
+         if 'original_text' in det and det['original_text'] != det['text']:
+             summary += f"{i}. Original: \"{det['original_text']}\"\n"
+             summary += f" Translated: \"{det['text']}\"\n"
+         else:
+             summary += f"{i}. \"{det['text']}\"\n"
+         summary += f" Location: ({det['x1']}, {det['y1']}) → ({det['x2']}, {det['y2']})\n\n"
+
+     return summary
+
+
+ def merge_detections(detections: List[Dict], threshold: int = 50) -> List[Dict]:
+     """
+     Merge close detection boxes into single boxes
+
+     Args:
+         detections: List of detection dicts
+         threshold: Distance threshold for merging
+
+     Returns:
+         List of merged detection dicts
+     """
+     if not detections:
+         return []
+
+     # Helper to check if two boxes are close
+     def are_close(box1, box2, thresh):
+         # Expand box1 by thresh
+         b1_x1, b1_y1 = box1['x1'] - thresh, box1['y1'] - thresh
+         b1_x2, b1_y2 = box1['x2'] + thresh, box1['y2'] + thresh
+
+         # Check overlap with box2
+         return not (b1_x2 < box2['x1'] or b1_x1 > box2['x2'] or
+                     b1_y2 < box2['y1'] or b1_y1 > box2['y2'])
+
+     # Build adjacency list
+     n = len(detections)
+     adj = [[] for _ in range(n)]
+     for i in range(n):
+         for j in range(i + 1, n):
+             if are_close(detections[i], detections[j], threshold):
+                 adj[i].append(j)
+                 adj[j].append(i)
+
+     # Find connected components
+     visited = [False] * n
+     merged_results = []
+
+     for i in range(n):
+         if not visited[i]:
+             # DFS (stack-based) to find the connected component
+             component = []
+             stack = [i]
+             visited[i] = True
+             while stack:
+                 curr = stack.pop()
+                 component.append(detections[curr])
+                 for neighbor in adj[curr]:
+                     if not visited[neighbor]:
+                         visited[neighbor] = True
+                         stack.append(neighbor)
+
+             # Merge component
+             if not component:
+                 continue
+
+             # Calculate merged bounds
+             min_x1 = min(d['x1'] for d in component)
+             min_y1 = min(d['y1'] for d in component)
+             max_x2 = max(d['x2'] for d in component)
+             max_y2 = max(d['y2'] for d in component)
+
+             # Sort texts: Right-to-Left (descending X), then Top-to-Bottom (ascending Y)
+             # This is standard for Manga reading order
+             component.sort(key=lambda d: (-d['x1'], d['y1']))
+
+             merged_text = "".join(d['text'] for d in component).replace(" ", "")
+
+             merged_results.append({
+                 'text': merged_text,
+                 'x1': min_x1,
+                 'y1': min_y1,
+                 'x2': max_x2,
+                 'y2': max_y2
+             })
+
+     return merged_results
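A small worked example of the merging behaviour (editorial, with made-up boxes): two detections within the default 50 px threshold are fused into one region, and their texts are joined right-to-left then top-to-bottom, matching manga reading order.

```python
# Minimal sketch: merge_detections on two hand-made boxes that sit close together.
from visualization import merge_detections

detections = [
    {'text': 'ダン', 'x1': 140, 'y1': 20, 'x2': 180, 'y2': 60},  # right column, read first
    {'text': 'ダン', 'x1': 100, 'y1': 20, 'x2': 130, 'y2': 60},  # left column, read second
]

merged = merge_detections(detections, threshold=50)
print(merged)
# -> [{'text': 'ダンダン', 'x1': 100, 'y1': 20, 'x2': 180, 'y2': 60}]
```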