speech_recognition
prepare.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 """This module has been written to prepare the files and directory structure
3 required by Kaldi-ASR to develop Acoustic Models.
4 
5 Structure of the CSV file:
6 
7 -----------------------------------------------------------------
8 | SPEAKER_ID | UTTERANCE_ID | WAV_PATH | TRANSCRIPTION | GENDER |
9 -----------------------------------------------------------------
10 
11 TODO:
12 1. Check correctness of file endings for each created data file
13 2. Analyse the best suited format for value of fields SPEAKER_ID and UTTERANCE_ID.
14  Change the following methods accordingly:
15  a. text
16  b. wav.scp
17  c. spk2utt
18  d. utt2spk
19 3. Analyse the best suited naming structure for each FILE
20 """
21 
22 import os
23 import sys
24 import csv
25 import shutil
26 
class Flags:
    """Validation flags for the CSV metadata file.

    The constructor takes no arguments; every flag starts out as False.
    """

    def __init__(self):
        # Overall verdict of the CSV check and result of the header lookup.
        self.csv_check = self.csv_header = False
        # One flag per required CSV column.
        self.wav_path = self.transcription = self.gender = False
        self.utterance_id = self.speaker_id = False
37 
class Index:
    """Column positions of the required fields in the CSV file.

    Every position is None until it has been assigned.
    """

    def __init__(self):
        self.wav_path = self.transcription = self.gender = None
        self.speaker_id = self.utterance_id = None
46 
47 
class DataPreparation:
    """Build the Kaldi data directory of one dataset from a CSV metadata file.

    Constructing an instance validates the CSV file, prepares the directory
    ``<srs_path>/data/<dataset>`` and writes the files ``text``,
    ``spk2gender``, ``wav.scp``, ``utt2spk`` and ``spk2utt`` into it.

    Args:
        csv_path (str): Path of the CSV metadata file
        srs_path (str): Path of the root directory of Speech Recognition System
        dataset (str): Type of Dataset; used as the name of the directory
            created below ``<srs_path>/data``
    """

    # (CSV header name, attribute name on ``self.index``) of every required
    # column, in the order in which the header is checked.
    _COLUMNS = (
        ("WAV_PATH", "wav_path"),
        ("TRANSCRIPTION", "transcription"),
        ("GENDER", "gender"),
        ("SPEAKER_ID", "speaker_id"),
        ("UTTERANCE_ID", "utterance_id"),
    )

    # Fields that must be non-empty in every row, in the order in which the
    # corresponding flags are evaluated.
    _NON_EMPTY = ("transcription", "gender", "speaker_id", "utterance_id")

    def __init__(self, csv_path, srs_path, dataset):
        """Validate the inputs, prepare the dataset directory, write all files.

        Every invalid input terminates the program through ``sys.exit`` with
        a message naming the problem.
        """
        if not csv_path.endswith('.csv'):
            sys.exit("File type not '.csv'")

        if not os.path.isfile(csv_path):
            sys.exit("Invalid CSV path")

        self.csv_path = csv_path
        # Made absolute so that a CSV given as a bare file name does not make
        # the wav paths resolve against '/', and so that the paths written to
        # 'wav.scp' do not depend on the working directory.
        self.csv_data_root = os.path.dirname(os.path.abspath(csv_path))

        self.flag = Flags()
        self.index = Index()
        self.csv_error = None

        self.csv_check()

        if not self.flag.csv_check:
            sys.exit("CSV check failed: " + str(self.csv_error))

        if not os.path.isdir(srs_path):
            sys.exit("Invalid path to Speech Recognition System root")

        # The dataset must name a single directory directly below 'data'.
        if (not dataset or dataset in ('.', '..')
                or os.path.basename(dataset) != dataset):
            sys.exit("Invalid dataset name")

        self.srs_path = srs_path.rstrip('/') or '/'
        self.srs_path_data = os.path.join(self.srs_path, "data")
        self.srs_path_data_dataset = os.path.join(self.srs_path_data, dataset)

        self._prepare_dataset_dir(dataset)

        self.text()
        self.spk2gender()
        self.wavscp()
        # Written explicitly: the dataset directory is fresh, so spk2utt()
        # would have to create it anyway.
        self.utt2spk()
        self.spk2utt()

    def _prepare_dataset_dir(self, dataset):
        """Create an empty dataset directory below ``<srs_path>/data``.

        Directories of other datasets are removed, except for 'test', which
        is kept unless 'test' itself is the dataset being prepared. Plain
        files and symbolic links inside the data directory are left alone.
        """
        os.makedirs(self.srs_path_data, exist_ok=True)

        for entry in os.listdir(self.srs_path_data):
            if entry == 'test' and dataset != 'test':
                continue
            entry_path = os.path.join(self.srs_path_data, entry)
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                shutil.rmtree(entry_path)

        os.makedirs(self.srs_path_data_dataset)

    def _data_rows(self):
        """Yield every non-blank data row of the CSV file (header skipped)."""
        with open(self.csv_path, 'r', newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=self.csv_delimiter)
            next(csv_reader, None)
            for row in csv_reader:
                if row:
                    yield row

    def _file_id(self, row):
        """Return the file id "<SPEAKER_ID>U<UTTERANCE_ID>" of a CSV row.

        The utterance id is left-padded with zeros to five characters.
        """
        sid = row[self.index.speaker_id]
        uid = row[self.index.utterance_id]
        return sid + 'U' + str(uid).zfill(5)

    def _write_lines(self, file_name, lines):
        """Write the lines to a file in the DATASET directory.

        The lines are separated by newlines; the output file must not end
        with a newline.
        """
        path = os.path.join(self.srs_path_data_dataset, file_name)
        with open(path, 'w') as out_file:
            out_file.write('\n'.join(lines))

    def csv_check(self):
        """csv_check performs the preliminary checks before the construction
        of the required data files.

        The following flow has been established:
            1. Read the first line of the CSV file to find the delimiter
            2. Read the Header of the CSV file
            3. Check whether the Header contains fields and store their
               indices:
                a. WAV_PATH (relative to the CSV file)
                b. TRANSCRIPTION
                c. GENDER
                d. SPEAKER_ID
                e. UTTERANCE_ID
            4. Read every row of the CSV file and check if all the wav
               files exist (blank lines are skipped)
            5. Check whether every row has a transcription, a gender, a
               speaker id and an utterance id

        The outcome is stored in ``self.flag``; ``self.flag.csv_check`` is
        True only if every check passed. On failure ``self.csv_error``
        holds a description of the first problem found. On success
        ``self.csv_delimiter``, ``self.index`` and ``self.total_files`` are
        set.
        """
        self.flag.csv_check = False
        self.csv_error = None

        with open(self.csv_path, 'r', newline='') as csv_file:
            header_line = csv_file.readline()

        try:
            self.csv_delimiter = csv.Sniffer().sniff(header_line).delimiter
        except csv.Error:
            self.flag.csv_header = False
            self.csv_error = "could not determine the delimiter"
            return

        with open(self.csv_path, 'r', newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=self.csv_delimiter)
            csv_header = next(csv_reader, [])

            missing = [name for name, _ in self._COLUMNS
                       if name not in csv_header]
            if missing:
                self.flag.csv_header = False
                self.csv_error = ("header is missing column(s): "
                                  + ", ".join(missing))
                return

            for name, attribute in self._COLUMNS:
                setattr(self.index, attribute, csv_header.index(name))
            self.flag.csv_header = True

            # Smallest number of fields a row needs to hold every column.
            width = 1 + max(getattr(self.index, attribute)
                            for _, attribute in self._COLUMNS)
            filled = dict.fromkeys(self._NON_EMPTY, 0)
            wav_files = 0

            for row in csv_reader:
                if not row:
                    # Blank line; _data_rows() skips these as well.
                    continue

                if len(row) < width:
                    self.csv_error = ("line %d has too few fields"
                                      % csv_reader.line_num)
                    return

                wav_path = os.path.join(self.csv_data_root,
                                        row[self.index.wav_path])
                # isfile() also rejects an empty WAV_PATH, which would
                # otherwise resolve to the (existing) CSV directory.
                if not os.path.isfile(wav_path):
                    self.flag.wav_path = False
                    self.csv_error = "wav file not found: " + wav_path
                    return
                wav_files += 1

                for attribute in self._NON_EMPTY:
                    if row[getattr(self.index, attribute)] != '':
                        filled[attribute] += 1

        self.flag.wav_path = True
        self.total_files = wav_files

        if wav_files == 0:
            self.csv_error = "CSV file contains no data rows"
            return

        for attribute in self._NON_EMPTY:
            complete = filled[attribute] == wav_files
            setattr(self.flag, attribute, complete)
            if not complete:
                self.csv_error = ("%d row(s) have an empty %s field"
                                  % (wav_files - filled[attribute],
                                     attribute.upper()))
                return

        self.flag.csv_check = True

    def text(self):
        """text prepares the file 'text' in the DATASET directory.

        The following flow has been established:
            1. Read the CSV file
            2. From each row extract:
                a. SPEAKER_ID
                b. UTTERANCE_ID
                c. TRANSCRIPTION
            3. Make file id "<SPEAKER_ID>U<UTTERANCE_ID>"
            4. Write an output file where each line has the structure:
                <FILE_ID><Tab_space><TRANSCRIPTION>
        """
        lines = [self._file_id(row) + '\t' + row[self.index.transcription]
                 for row in self._data_rows()]
        self._write_lines('text', lines)

    def spk2gender(self):
        """spk2gender prepares the file 'spk2gender' in the DATASET directory.

        The following flow has been established:
            1. Read the CSV file
            2. Remember the gender given in the first row of every speaker
            3. Write an output file where each line has the structure:
                <SPEAKER_ID><Tab_space><GENDER>

        Every speaker is written exactly once, in order of first appearance,
        even if the rows of a speaker are not adjacent in the CSV file.
        """
        genders = {}
        for row in self._data_rows():
            genders.setdefault(row[self.index.speaker_id],
                               row[self.index.gender])

        lines = [sid + '\t' + gender for sid, gender in genders.items()]
        self._write_lines('spk2gender', lines)

    def wavscp(self):
        """wavscp prepares the file 'wav.scp' in the DATASET directory.

        The following flow has been established:
            1. Read the CSV file
            2. From each row extract:
                a. SPEAKER_ID
                b. UTTERANCE_ID
                c. WAV_PATH (relative to the CSV file)
            3. Make FILE_ID "<SPEAKER_ID>U<UTTERANCE_ID>"
            4. Make FILE_PATH "<CSV_DATA_ROOT>/<WAV_PATH>"
            5. Write an output file where each line has the structure:
                <FILE_ID><Tab_space><FILE_PATH>
        """
        lines = []
        for row in self._data_rows():
            fpath = os.path.join(self.csv_data_root,
                                 row[self.index.wav_path])
            lines.append(self._file_id(row) + '\t' + fpath)
        self._write_lines('wav.scp', lines)

    def utt2spk(self):
        """utt2spk prepares the file 'utt2spk' in the DATASET directory.

        The following flow has been established:
            1. Read the CSV file
            2. From each row extract:
                a. SPEAKER_ID
                b. UTTERANCE_ID
            3. Make FILE_ID "<SPEAKER_ID>U<UTTERANCE_ID>"
            4. Write an output file where each line has the structure:
                <FILE_ID><Tab_space><SPEAKER_ID>
        """
        lines = [self._file_id(row) + '\t' + row[self.index.speaker_id]
                 for row in self._data_rows()]
        self._write_lines('utt2spk', lines)

    def spk2utt(self):
        """spk2utt prepares the file 'spk2utt' in the DATASET directory.

        The following flow has been established:
            1. Read 'utt2spk' from the DATASET directory (if missing,
               create it)
            2. From each line extract:
                a. FILE_ID
                b. SPEAKER_ID
            3. Collect the file ids of every speaker
            4. Write an output file where each line has the structure:
                <SPEAKER_ID> <FILE_ID_1> <FILE_ID_2> ... <FILE_ID_END>

        Every speaker gets exactly one line, in order of first appearance in
        'utt2spk'.
        """
        utt2spk_path = os.path.join(self.srs_path_data_dataset, 'utt2spk')
        if not os.path.exists(utt2spk_path):
            self.utt2spk()

        file_ids = {}
        with open(utt2spk_path, 'r') as utt2spk:
            for line in utt2spk:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or malformed line.
                    continue
                file_ids.setdefault(fields[1], []).append(fields[0])

        lines = [' '.join([sid] + fids) for sid, fids in file_ids.items()]
        self._write_lines('spk2utt', lines)
416 
def main():
    """When script is executed as __main__:

    Args:
        param1 (str): Absolute path of the CSV metadata file
        param2 (str): Absolute path of the root directory of Speech Recognition System
        param3 (str): Type of Dataset

    Exits with a usage message unless exactly three arguments are given.
    """
    if len(sys.argv) != 4:
        # Covers both too few and too many arguments.
        sys.exit("Expected 3 arguments, got {}.\n"
                 "Usage: prepare.py <csv_path> <srs_path> <dataset>"
                 .format(max(len(sys.argv) - 1, 0)))

    csv_path, srs_path, dataset = sys.argv[1:]
    DataPreparation(csv_path, srs_path, dataset)


if __name__ == '__main__':
    main()
prepare.Index.gender
gender
Definition: prepare.py:43
prepare.DataPreparation.total_files
total_files
Definition: prepare.py:177
prepare.Flags.transcription
transcription
Definition: prepare.py:33
prepare.Flags.csv_header
csv_header
Definition: prepare.py:31
prepare.DataPreparation.text
def text(self)
Definition: prepare.py:209
prepare.Index.wav_path
wav_path
Definition: prepare.py:41
prepare.Index.__init__
def __init__(self)
Definition: prepare.py:40
prepare.DataPreparation.csv_path
csv_path
Definition: prepare.py:67
prepare.Index.transcription
transcription
Definition: prepare.py:42
prepare.Flags.gender
gender
Definition: prepare.py:34
prepare.DataPreparation.utt2spk
def utt2spk(self)
Definition: prepare.py:334
prepare.DataPreparation.spk2gender
def spk2gender(self)
Definition: prepare.py:246
prepare.DataPreparation.csv_delimiter
csv_delimiter
Definition: prepare.py:128
prepare.DataPreparation.spk2utt
def spk2utt(self)
Definition: prepare.py:369
prepare.DataPreparation.srs_path
srs_path
Definition: prepare.py:86
prepare.Flags
Definition: prepare.py:27
prepare.DataPreparation.__init__
def __init__(self, csv_path, srs_path, dataset)
Definition: prepare.py:57
prepare.DataPreparation.csv_check
def csv_check(self)
Definition: prepare.py:108
prepare.DataPreparation.index
index
Definition: prepare.py:71
prepare.Flags.speaker_id
speaker_id
Definition: prepare.py:36
prepare.DataPreparation.csv_data_root
csv_data_root
Definition: prepare.py:68
prepare.Flags.__init__
def __init__(self)
Definition: prepare.py:29
prepare.Flags.wav_path
wav_path
Definition: prepare.py:32
prepare.DataPreparation.wavscp
def wavscp(self)
Definition: prepare.py:295
prepare.DataPreparation
Definition: prepare.py:48
prepare.main
def main()
Definition: prepare.py:417
prepare.Flags.utterance_id
utterance_id
Definition: prepare.py:35
prepare.DataPreparation.flag
flag
Definition: prepare.py:70
prepare.Index.utterance_id
utterance_id
Definition: prepare.py:45
prepare.DataPreparation.srs_path_data_dataset
srs_path_data_dataset
Definition: prepare.py:88
prepare.Index
Definition: prepare.py:38
prepare.Flags.csv_check
csv_check
Definition: prepare.py:30
prepare.DataPreparation.srs_path_data
srs_path_data
Definition: prepare.py:87
prepare.Index.speaker_id
speaker_id
Definition: prepare.py:44