3okasha commited on
Commit
42d9280
·
verified ·
1 Parent(s): 07f6de6

Create generate_dataset.py

Browse files
Files changed (1) hide show
  1. generate_dataset.py +204 -0
generate_dataset.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """CollectParaphrizingData.ipynb
3
+ """
4
+
5
+ from seamless_communication.models.inference import Translator
6
+
7
+ import pickle
8
+ from tqdm import tqdm
9
+
10
+ import os
11
+ import torch
12
+ import urllib.request
13
+ import zipfile
14
+ import xml.etree.ElementTree as ET
15
+
16
+ import time
17
+ import re
18
+ from pyarabic.araby import normalize_hamza, strip_tatweel, strip_tashkeel
19
+ #from IPython.core.display import display, HTML
20
+
21
+ from ghalatawi.autocorrector import AutoCorrector
22
+ from ghalatawi.ar_ghalat import isArabicword
23
+ from ghalatawi.ghalat_const import ReplacementTablePount
24
+ import naftawayh.wordtag as wordtag
25
+ import hunspell
26
+
27
# Fetch the OPUS OpenSubtitles v2018 Arabic archive and extract it, unless
# the extracted folder (or an already-downloaded zip) is present on disk.
zip_file_path = 'ar.zip'
unzip_folder = 'OpenSubtitles'

if os.path.exists(unzip_folder):
    print(f"{unzip_folder} already exists. Skipping download and unzip steps.")
else:
    if not os.path.exists(zip_file_path):
        print("Downloading...")
        urllib.request.urlretrieve(
            'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/raw/ar.zip',
            zip_file_path,
        )
    print("Unzipping...")
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_folder)
    print("Unzipped.")
43
# Function to extract Arabic sentences from an OpenSubtitles XML file.
def get_arabic_sentences_from_xml(xml_file_path):
    """Parse *xml_file_path* and return the text of every non-empty <s> element.

    The full text content of each <s> element (including nested tags) is
    concatenated and stripped; sentences that end up empty are dropped.
    """
    root = ET.parse(xml_file_path).getroot()
    stripped = ("".join(node.itertext()).strip() for node in root.iter('s'))
    return [sentence for sentence in stripped if sentence]
53
+
54
+
55
+
56
# Seamless M4T text-to-text model used for round-trip (back-)translation.
# NOTE(review): runs on CPU with float64 — extremely slow for inference;
# confirm this configuration is intended.
translator = Translator(
    "seamlessM4T_large",
    vocoder_name_or_card="vocoder_36langs",
    device=torch.device("cpu"),
    dtype=torch.float64,
)
58
def generate_paraphrizing_texts(text):
    """Round-trip *text* (Arabic) through several pivot languages.

    For each pivot language, translate Arabic -> pivot -> Arabic and collect
    the back-translated outputs as candidate paraphrases. Returns the list of
    translator result objects (not plain strings).
    """
    paraphrases = []
    for pivot in ('eng', 'fra', 'cmn', 'spa', 'rus'):
        # Translate the text into the pivot language.
        forward, _, _ = translator.predict(text, 't2tt', pivot, src_lang='arb')
        # Translate it back into Arabic.
        backward, _, _ = translator.predict(forward, 't2tt', 'arb', src_lang=pivot)
        paraphrases.append(backward)
    return paraphrases
67
+
68
+
69
+
70
+
71
+
72
# Arabic correction resources: the ghalatawi auto-corrector, the naftawayh
# word tagger, and a hunspell checker backed by the system Arabic dictionary.
autoco = AutoCorrector()
tagger = wordtag.WordTagger()
hobj = hunspell.HunSpell('/usr/share/hunspell/ar.dic', '/usr/share/hunspell/ar.aff')
75
+
76
def normalize_text(text):
    """Reduce *text* to normalized Arabic.

    Keeps Arabic letters, ASCII digits, whitespace and recognized Arabic
    punctuation; drops ellipsis shortcuts; then removes tashkeel and tatweel
    and normalizes hamza using the "tasheel" method.
    """
    # Keep only Arabic letters, digits, whitespace and Arabic punctuation.
    kept = re.sub(r'[^ء-ي0-9،؛؟.!\s]', '', text)
    # Remove "..." then ".." shortcut symbols.
    for shortcut in ('...', '..'):
        kept = kept.replace(shortcut, '')
    without_diacritics = strip_tashkeel(kept)
    without_tatweel = strip_tatweel(without_diacritics)
    return normalize_hamza(without_tatweel, method="tasheel")
82
+
83
def contains_arabic(text):
    """Return True if *text* contains at least one Arabic-script character."""
    # Arabic, Arabic Supplement, and Arabic Extended-A Unicode blocks.
    arabic_ranges = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]'
    return re.search(arabic_ranges, text) is not None
88
+
89
def spell_ghalatawi(text):
    """Auto-correct *text* with the ghalatawi corrector and return the result."""
    corrected = autoco.spell(text)
    return corrected
91
+
92
def spell_hunspell(text):
    """Spell-correct each word of *text* with hunspell and rejoin with spaces.

    Words matching any pattern in ReplacementTablePount are kept unchanged.
    A misspelled Arabic word is replaced by hunspell's first suggestion when
    one exists; everything else passes through untouched.
    """
    corrected_words = []

    for word in text.split():
        # Keep words that match a known punctuation-replacement pattern as-is.
        if any(pattern.search(word) for (pattern, replacement) in ReplacementTablePount):
            corrected_words.append(word)
            continue

        # FIX: the original called hobj.suggest(word) twice per misspelled
        # word (once in the condition, once to take the first suggestion) —
        # an expensive hunspell C-library call; call it once and reuse.
        if isArabicword(word) and not hobj.spell(word):
            suggestions = hobj.suggest(word)
            if suggestions:
                corrected_words.append(suggestions[0])
                continue
        corrected_words.append(word)

    return ' '.join(corrected_words)
110
+
111
def print_html(text):
    """Plain-text fallback for the notebook's HTML display helper."""
    print(text)
113
+
114
+
115
+
116
# Resume support: load the set of already-processed subfolders if a
# checkpoint file exists.
# NOTE(review): this value is recomputed below via load_completed_folders(),
# so this bootstrap is effectively dead code; kept for behavioral parity.
completed_folders = []
if os.path.exists('completed_folders.pkl'):
    with open('completed_folders.pkl', 'rb') as f:
        completed_folders = pickle.load(f)

# NOTE(review): file_count is never used anywhere in this script.
file_count = 0
123
+
124
+
125
def load_completed_folders(file_path):
    """Return the pickled list stored at *file_path*, or [] if it is absent."""
    if not os.path.exists(file_path):
        return []
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)
130
+
131
# Buffer output lines in memory and flush them to disk in batches.
buffered_data = []
BUFFER_SIZE = 100  # lines per flush

base_folder = 'OpenSubtitles/OpenSubtitles/raw/ar'
completed_folders = load_completed_folders('completed_folders.pkl')

# Year-named subdirectories of the extracted corpus root.
year_folders = [
    entry
    for entry in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, entry))
]
141
+
142
# Main pass: walk every year/subfolder of the corpus, paraphrase each Arabic
# sentence via round-trip translation, spell-correct the results, and append
# "original\t\tparaphrase" lines to the output file in buffered batches.
# Progress is checkpointed per subfolder into completed_folders.pkl so the
# script can resume after an interruption.
for index, year_folder in enumerate(year_folders, start=1):
    year_path = os.path.join(base_folder, year_folder)
    print(f"Processing year folder {index}/{len(year_folders)}: {year_folder}")

    subfolders_list = [sf for sf in os.listdir(year_path) if os.path.isdir(os.path.join(year_path, sf))]

    # tqdm progress bar for subfolders
    for sub_folder in tqdm(subfolders_list, desc="Subfolders", ncols=100):
        if sub_folder in completed_folders:
            continue

        sub_folder_path = os.path.join(year_path, sub_folder)

        for xml_file in os.listdir(sub_folder_path):
            if not xml_file.endswith('.xml'):
                continue
            xml_file_path = os.path.join(sub_folder_path, xml_file)

            arabic_sentences = get_arabic_sentences_from_xml(xml_file_path)
            arabic_sentences = arabic_sentences[3:]  # Skip introduction

            for sentence in arabic_sentences:
                # Skip very short lines and lines with no Arabic script.
                if len(sentence) < 5 or not contains_arabic(sentence):
                    continue

                paraphrased_texts = []
                for paraphrasing_object in generate_paraphrizing_texts(normalize_text(sentence)):
                    try:
                        paraphrasing_string = paraphrasing_object.bytes().decode('utf-8')

                        # Skip duplicate back-translations for this sentence.
                        if paraphrasing_string in paraphrased_texts:
                            continue
                        paraphrased_texts.append(paraphrasing_string)

                        # Two-stage spell correction: ghalatawi, then hunspell.
                        ghalatawi_spell = spell_ghalatawi(paraphrasing_string)
                        if paraphrasing_string != ghalatawi_spell:
                            paraphrasing_string = ghalatawi_spell

                        hunspell_spell = spell_hunspell(paraphrasing_string)
                        if paraphrasing_string != hunspell_spell:
                            paraphrasing_string = hunspell_spell

                        buffered_data.append(f"{sentence}\t\t{paraphrasing_string}\n")

                        # Flush the buffer to disk in batches of BUFFER_SIZE.
                        if len(buffered_data) >= BUFFER_SIZE:
                            # FIX: write Arabic text as UTF-8 explicitly
                            # instead of relying on the platform default
                            # encoding, which can crash on non-UTF-8 locales.
                            with open("open_subtitle_para_en_.txt", "a", encoding="utf-8") as f:
                                f.write("".join(buffered_data))
                            buffered_data.clear()

                    # FIX: the original bare `except:` swallowed every error
                    # (including KeyboardInterrupt/SystemExit) and printed no
                    # detail; catch Exception only and report what went wrong.
                    except Exception as exc:
                        print(f"\nAn exception occurred: {exc}")

        # Checkpoint: record the finished subfolder so reruns skip it.
        completed_folders.append(sub_folder)
        print(f"\nCompleted subfolder: {sub_folder}")
        with open('completed_folders.pkl', 'wb') as f:
            pickle.dump(completed_folders, f)
200
+
201
# Write remaining buffered data to disk if any.
# FIX: the original flushed the tail to "open_subtitle_para_ar_.txt" while
# the in-loop batch flushes append to "open_subtitle_para_en_.txt", silently
# splitting the dataset across two files; write the tail to the same file.
# Also write explicitly as UTF-8 since the buffer holds Arabic text.
if buffered_data:
    with open("open_subtitle_para_en_.txt", "a", encoding="utf-8") as f:
        f.write("".join(buffered_data))