2 """This module has been written to prepare the files and directory structure
3 required by Kaldi-ASR to develop Acoustic Models.

Structure of the CSV file:

-----------------------------------------------------------------
| SPEAKER_ID | UTTERANCE_ID | WAV_PATH | TRANSCRIPTION | GENDER |
-----------------------------------------------------------------
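
An example row with purely hypothetical values (the real delimiter is
detected automatically with csv.Sniffer, so it need not be a comma):

    spk001,00001,wav/spk001/utt00001.wav,hello world,f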

TODO:
1. Check the correctness of file endings for each created data file
2. Analyse the best suited format for the values of the SPEAKER_ID and
   UTTERANCE_ID fields. Change the following methods accordingly:
3. Analyse the best suited naming structure for each FILE
"""

import csv
import os
import sys

28 """Class Flags takes no input arguments and initializes all flags to False"""
39 """Class Index has been defined to maintain the indices of the CSV file."""
49 """Class DataPreparation takes 2 input arguments
52 param1 (str): Absolute path of the CSV metadata file
53 param2 (str): Absolute path of the root directory of Speech Recognition System
54 param3 (str): Type of Dataset
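
    For illustration, a hypothetical call (the paths and the dataset name
    'train' are placeholders, not values taken from this project):

        DataPreparation('/data/metadata.csv', '/home/user/srs', 'train')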
    def __init__(self, csv_path, srs_path, dataset):
        """DataPreparation class constructor."""
        file_extension = csv_path.split('.')[-1]
        if file_extension != 'csv':
            sys.exit("File type not '.csv'")
        if not os.path.exists(csv_path):
            sys.exit("Invalid CSV path")
        if not self.flag.csv_check:
            sys.exit("CSV check failed")
        if srs_path[-1] == '/':
            srs_path = srs_path[:-1]
        if not os.path.exists(srs_path):
            sys.exit("Invalid path to Speech Recognition System root")
        if data_dir_contents == []:

        data_dir_contents.remove('test')
        for directory in data_dir_contents:
109 """csvCheck performs the preliminary checks before the construction of
110 the required data files.
112 The following flow has been established:
113 1. Read the CSV file to find the delimiter
114 2. Read the Header of the CSV file
115 3. Check whether the Header contains fields and store their indices:
117 b. WAV_PATH (relative to the CSV file)
121 4. Read every row of the CSV file and check if all the wav paths exist
122 5. Check whether number of wav files and transcriptions are equal
        with open(self.csv_path, 'r') as csv_file:
            header = csv_file.readline()
            sniffer = csv.Sniffer()
            dialect = sniffer.sniff(header)
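            # Assumed reconstruction of an elided line: the sniffed dialect
            # would supply the delimiter that the csv.reader calls below use.
            self.csv_delimiter = dialect.delimiter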
        with open(self.csv_path, 'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=self.csv_delimiter)
            csv_header = next(csv_reader)
            self.index.wav_path = csv_header.index("WAV_PATH")
            self.index.transcription = csv_header.index("TRANSCRIPTION")
            self.index.gender = csv_header.index("GENDER")
            self.index.speaker_id = csv_header.index("SPEAKER_ID")
            self.index.utterance_id = csv_header.index("UTTERANCE_ID")
            self.flag.csv_header = False
            self.flag.csv_check = False

            self.flag.csv_header = True
            for row in csv_reader:
                wav_rel_path = row[self.index.wav_path]
                if not os.path.exists(wav_path):
                    self.flag.wav_path = False
                    self.flag.csv_check = False
                if row[self.index.transcription] != '':

                if row[self.index.gender] != '':

                if row[self.index.speaker_id] != '':

                if row[self.index.utterance_id] != '':

            self.flag.wav_path = True
            self.flag.transcription = False
            self.flag.csv_check = False

            self.flag.transcription = True

            self.flag.gender = False
            self.flag.csv_check = False

            self.flag.gender = True

            self.flag.speaker_id = False
            self.flag.csv_check = False

            self.flag.speaker_id = True

            self.flag.utterance_id = False
            self.flag.csv_check = False

            self.flag.utterance_id = True

        self.flag.csv_check = True
210 """text prepares the file 'text' in the DATASET directory.
212 The following flow has been established:
214 2. From each row extract:
218 3. Make file id "<SPEAKER_ID>U<UTTERANCE_ID>"
219 4. Write an output file where each line has the structure:
220 <FILE_ID><Tab_space><TRANSCRIPTION>
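
        For example, with a hypothetical speaker id 'spk001', utterance id 1,
        and transcription "hello world", the corresponding line would be:

            spk001U00001<Tab_space>hello world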
        with open(self.csv_path, 'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=self.csv_delimiter)
            row = next(csv_reader)
            row = next(csv_reader)

            sid = row[self.index.speaker_id]
            uid = row[self.index.utterance_id]
            transcription = row[self.index.transcription]

            fid = sid + 'U' + str(uid).zfill(5)

            out += fid + '\t' + transcription + '\n'
247 """spk2gender prepares the file 'spk2gender' in the DATASET directory.
249 The following flow has been established:
251 2. Read the first row (First Speaker) and extract the gender details
252 3. Search for a new speaker and extract its gender details
253 4. Write an output file where each line has the structure:
254 <SPEAKER_ID><Tab_space><GENDER>
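
        For example, for a hypothetical speaker 'spk001' whose GENDER field is 'f':

            spk001<Tab_space>f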
        with open(self.csv_path, 'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=self.csv_delimiter)
            row = next(csv_reader)
            row = next(csv_reader)
            sid = row[self.index.speaker_id]
            gender = row[self.index.gender]
            out += sid + '\t' + gender + '\n'
            sid = row[self.index.speaker_id]
            gender = row[self.index.gender]

            out += sid + '\t' + gender + '\n'

            row = next(csv_reader)

            if row[self.index.speaker_id] == sid:
296 """wavscp prepares the file 'wav.scp' in the DATASET directory.
298 The following flow has been established:
300 2. From each row extract:
303 c. WAV_PATH (relative to the CSV file)
304 3. Make FILE_ID "<SPEAKER_ID>U<UTTERANCE_ID>"
305 4. Make FILE_PATH "<CSV_DATA_ROOT>/<WAV_PATH>"
306 5. Write an output file where each line has the structure:
307 <FILE_ID><Tab_space><FILE_PATH>
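
        For example, with hypothetical values (the wav path shown is a placeholder):

            spk001U00001<Tab_space>/path/to/csv_data_root/wav/spk001/utt00001.wav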
        with open(self.csv_path, 'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=self.csv_delimiter)
            row = next(csv_reader)
            row = next(csv_reader)

            sid = row[self.index.speaker_id]
            uid = row[self.index.utterance_id]
            wav_rel_path = row[self.index.wav_path]

            fid = sid + 'U' + str(uid).zfill(5)

            out += fid + '\t' + fpath + '\n'
335 """utt2spk prepares the file 'utt2spk' in the DATASET directory.
337 The following flow has been established:
339 2. From each row extract:
342 3. Make FILE_ID "<SPEAKER_ID>U<UTTERANCE_ID>"
343 4. Write an output file where each line has the structure:
344 <FILE_ID><Tab_space><SPEAKER_ID>
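
        For example, with a hypothetical speaker 'spk001' and utterance id 1:

            spk001U00001<Tab_space>spk001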
        with open(self.csv_path, 'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=self.csv_delimiter)
            row = next(csv_reader)
            row = next(csv_reader)

            sid = row[self.index.speaker_id]
            uid = row[self.index.utterance_id]

            fid = sid + 'U' + str(uid).zfill(5)

            out += fid + '\t' + sid + '\n'
370 """spk2utt prepares the file 'spk2utt' in the DATASET directory.
372 The following flow has been established:
373 1. Read 'utt2spk' from the DATASET directory (if missing, create it)
374 2. From each row extract:
377 3. Write an output file where each line has the structure:
378 <SPEAKER_ID> <FILE_ID_1> <FILE_ID_2> ... <FILE_ID_END>
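
        For example, a hypothetical speaker 'spk001' with two utterances:

            spk001 spk001U00001 spk001U00002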
            row = next(utt2spk).split()

            out += sid + ' ' + fid

            out += '\n' + sid + ' ' + fid

            row = next(utt2spk).split()
418 """When script is executed as __main__:
421 param1 (str): Absolute path of the CSV metadata file
422 param2 (str): Absolute path of the root directory of Speech Recognition System
423 param3 (str): Type of Dataset
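
    Example invocation (the script name and the argument values shown here
    are placeholders):

        python data_preparation.py /data/metadata.csv /home/user/srs train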
    if len(sys.argv) != 4:
        sys.exit("Expected exactly 3 input arguments")

    csv_path = sys.argv[1]
    srs_path = sys.argv[2]
    dataset = sys.argv[3]


if __name__ == '__main__':