saifisvibin committed on
Commit
b422a66
·
verified ·
1 Parent(s): b3a8322

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +20 -0
main.py CHANGED
@@ -543,6 +543,26 @@ def write_markdown_document(pdf_path: Path, out_dir: Path) -> Optional[Path]:
543
  logger.warning(f" No textual content extracted from {pdf_path.name}")
544
  return None
545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
  if not markdown_content.endswith("\n"):
547
  markdown_content += "\n"
548
 
 
543
  logger.warning(f" No textual content extracted from {pdf_path.name}")
544
  return None
545
 
546
+ # --- FORMATTING POST-PROCESSING ---
547
+ import re
548
+ # 0. Normalize Windows CRLF to Unix LF so regex works correctly
549
+ markdown_content = markdown_content.replace('\r\n', '\n').replace('\r', '\n')
550
+
551
+ # 1. Ensure headers have a blank line before and after them
552
+ markdown_content = re.sub(r'^(#{1,6}\s+.*)$', r'\n\n\1\n\n', markdown_content, flags=re.MULTILINE)
553
+
554
+ # 2. Fix arbitrary hard line breaks in paragraphs inside sentences
555
+ markdown_content = re.sub(r'([a-z,:\-])\n([a-zA-Z])', r'\1 \2', markdown_content)
556
+
557
+ # 3. Ensure double newlines between sentences across lines
558
+ markdown_content = re.sub(r'([.?!])\n([A-Z])', r'\1\n\n\2', markdown_content)
559
+
560
+ # 4. Clean up excessive newlines
561
+ markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
562
+
563
+ markdown_content = markdown_content.strip()
564
+ # ----------------------------------
565
+
566
  if not markdown_content.endswith("\n"):
567
  markdown_content += "\n"
568