Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -543,6 +543,26 @@ def write_markdown_document(pdf_path: Path, out_dir: Path) -> Optional[Path]:
|
|
| 543 |
logger.warning(f" No textual content extracted from {pdf_path.name}")
|
| 544 |
return None
|
| 545 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 546 |
if not markdown_content.endswith("\n"):
|
| 547 |
markdown_content += "\n"
|
| 548 |
|
|
|
|
| 543 |
logger.warning(f" No textual content extracted from {pdf_path.name}")
|
| 544 |
return None
|
| 545 |
|
| 546 |
+
# --- FORMATTING POST-PROCESSING ---
|
| 547 |
+
import re
|
| 548 |
+
# 0. Normalize Windows CRLF and bare CR line endings to Unix LF so regex works correctly
|
| 549 |
+
markdown_content = markdown_content.replace('\r\n', '\n').replace('\r', '\n')
|
| 550 |
+
|
| 551 |
+
# 1. Ensure headers have a blank line before and after them
|
| 552 |
+
markdown_content = re.sub(r'^(#{1,6}\s+.*)$', r'\n\n\1\n\n', markdown_content, flags=re.MULTILINE)
|
| 553 |
+
|
| 554 |
+
# 2. Join hard line breaks that fall mid-sentence: a line ending in a lowercase letter, comma, colon, or hyphen followed by a line starting with a letter
|
| 555 |
+
markdown_content = re.sub(r'([a-z,:\-])\n([a-zA-Z])', r'\1 \2', markdown_content)
|
| 556 |
+
|
| 557 |
+
# 3. Insert a blank line where a line ending in '.', '?' or '!' is followed by a line starting with a capital letter
|
| 558 |
+
markdown_content = re.sub(r'([.?!])\n([A-Z])', r'\1\n\n\2', markdown_content)
|
| 559 |
+
|
| 560 |
+
# 4. Collapse runs of three or more newlines into a single blank line
|
| 561 |
+
markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
|
| 562 |
+
|
| 563 |
+
markdown_content = markdown_content.strip()
|
| 564 |
+
# ----------------------------------
|
| 565 |
+
|
| 566 |
if not markdown_content.endswith("\n"):
|
| 567 |
markdown_content += "\n"
|
| 568 |
|