speech_recognition
kaldi_grammar.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 
3 # Make python 2/3 compatible
4 from __future__ import (absolute_import, division,
5  print_function, unicode_literals)
6 from builtins import *
7 
8 import os
9 import shutil
10 from grammar_parser.cfgparser import CFGParser
11 from graphviz import render
12 
13 
class Grammar:
    """
    Class Grammar uses as input a grammar file with extension '.fcfg' (or the
    grammar as a plain string) and has two main functions:
    get_words_: extracts the unique words and creates 'corpus.txt' which is used to build 'G.fst'
    expand_tree_: expands the defined grammar rules into a tree of sentence nodes
    """

    def __init__(self, model_path, grammar_file_string, target):
        """
        :param model_path: path to the Kaldi model directory; a fresh 'tmp'
            subdirectory is (re)created inside it
        :param grammar_file_string: either a path to a grammar file or the
            grammar itself as a string
        :param target: name of the target rule to expand
        :raises Exception: if model_path does not exist
        """
        self.model_path = model_path
        self.model_path_tmp = os.path.join(self.model_path, "tmp")

        # If model_path exists, create a (clean) tmp directory in it
        if not os.path.exists(self.model_path):
            raise Exception("Model path '{}' does not exist".format(self.model_path))
        else:
            if os.path.exists(self.model_path_tmp):
                shutil.rmtree(self.model_path_tmp)

            os.mkdir(self.model_path_tmp)

        # Check if the grammar is a file or string and parse it
        if os.path.exists(grammar_file_string):
            self.parser = CFGParser.fromfile(grammar_file_string)
            self.grammar_file = grammar_file_string
            self.grammar_string = None
        else:
            self.parser = CFGParser.fromstring(grammar_file_string)
            self.grammar_file = None
            self.grammar_string = grammar_file_string

        self.target = target

        # Execute the following in the constructor
        self.get_words_()
        self.tree = self.expand_tree_()

    def get_words_(self):
        """
        Extracts a list with all the unique words used within the grammar and
        creates file 'corpus.txt' (uppercased, sorted, one word per line)
        which is used to build 'G.fst'
        """
        # Extract rules from the grammar file
        rules = self.parser.rules

        # Extract words
        words = set()

        # BUG FIX: dict.iteritems() does not exist on Python 3; iterate the
        # values directly (the key was unused anyway).
        for value in rules.values():
            # Get the list of options for the rule value
            for option in value.options:
                # Get the list of conjuncts for option 'option'
                for conjunct in option.conjuncts:
                    # If conjunct is not a variable put its value in the set of words
                    if not conjunct.is_variable:
                        words.add(conjunct.name)

        words = sorted(word.upper() for word in words)

        # Create corpus.txt file and save the words list
        corpus_path = os.path.join(self.model_path_tmp, "corpus.txt")
        with open(corpus_path, "w") as f:
            for word in words:
                f.write(word + "\n")

    def autocomplete(self):
        """
        Interactively filter the expanded sentence list one word at a time.

        # TODO: expand the full tree, not only the first words
        # replace input with the speech recognition output
        # add an option to skip a word if it is not a match and to check the
        # next word

        :return: list of the recognised words so far
        """
        recognised_sentence = []

        # BUG FIX: raw_input() does not exist on Python 3; input() (also
        # backported by 'from builtins import *') behaves the same on both.
        # The no-op 'type(recognition)' statement was removed.
        recognition = input("Recognised word: ")

        # create a filtered list, based on the recognised first word
        initial_list, recognised = self.check_word(recognition)
        if not recognised:
            print('Not a match')
        else:
            # remove the first word from each line
            first_word = [line.pop(0) for line in initial_list]
            recognised_sentence.append(first_word[0])
            sentence_list = initial_list

            print('Initial filtered list: \n')
            self.print_nicely(sentence_list)

            while len(sentence_list[0]) > 0:
                next_recognition = input("Next recognised word: ")

                # create a filtered list, based on the next recognised word
                new_initial_list, recognised = self.check_word(next_recognition, sentence_list)
                if not recognised:
                    print('Not a match')
                    break
                else:
                    # remove the first word from each line
                    next_word = [line.pop(0) for line in new_initial_list]
                    recognised_sentence.append(next_word[0])
                    sentence_list = new_initial_list

                    print('New filtered list: \n')
                    self.print_nicely(sentence_list)

        print('Recognised sentence: \n' + str(recognised_sentence))
        return recognised_sentence

    def check_word(self, recognition='', initial_list=None):
        """
        Checks if the recognised word matches the first element of the expanded
        sentences. As output it keeps a list of only the sentences starting
        with the recognised word.

        :param recognition: the recognised word
        :param initial_list: candidate sentences to filter; when empty or None
            the full expanded sentence list is generated first
        :return: (filtered_list, recognised) — the sentences (as word lists)
            whose first word matches, and whether at least one matched
        """
        recognised = False

        # BUG FIX: the original used a mutable default argument ([]), which is
        # shared between calls; None is used as the default instead.
        if not initial_list:
            # NOTE(review): Grammar defines no 'expand_tree' method (only
            # 'expand_tree_', which returns a tree root, not a sentence list).
            # This call looks broken in the original; confirm the intended
            # source of the expanded sentence list before relying on it.
            initial_list = self.expand_tree()

            filtered_list = []
            for sentence in initial_list:
                # Freshly expanded sentences hold conjunct objects; compare by name.
                line = [item.name for item in sentence]
                if line[0] == recognition:
                    filtered_list.append(line)
        else:
            filtered_list = []
            for sentence in initial_list:
                # Already-filtered sentences are plain word lists; copy them.
                line = [item for item in sentence]
                if line[0] == recognition:
                    filtered_list.append(line)

        if len(filtered_list) > 0:
            recognised = True

        print('Filtered list: \n')
        print(recognised)
        self.print_nicely(filtered_list)
        return filtered_list, recognised

    def print_nicely(self, sentence_list):
        """
        Prints cleanly the output of the tree traversal functions

        :param sentence_list: list of possible completions
        """
        for sentence in sentence_list:
            line = [item for item in sentence]
            print(" ".join(line))
        print('')

    def expand_tree_(self):
        """
        Expands the grammar tree based on the words in the grammar rules for the
        pre-set target

        :return: tree of sentence nodes
        """
        # Extract rules from the grammar file
        rules = self.parser.rules
        return expand_tree(rules, self.target)

    def parse(self, sentence):
        """
        Parses the input sentence to generate the semantics for the pre-set
        target

        :param sentence: The sentence to be parsed
        :return: semantics
        """
        semantics = self.parser.parse(self.target, sentence)
        return semantics

    def print_graphviz(self):
        """
        Wrapper around the module-level print_graphviz function to print the
        current tree
        """
        # NOTE(review): the original statement on this line was lost in
        # extraction; reconstructed from the docstring and the module-level
        # print_graphviz(root_node, outpath) signature.
        print_graphviz(self.tree, self.model_path_tmp)
205 
class SentenceNode:
    """
    A node in a sentence.

    :ivar edges: Edges to the next node.
    :ivar done: Reached the end of the sentence.
    """
    # NOTE(review): the 'class SentenceNode:' header line was stripped by the
    # documentation extraction (the trailing index places the definition at
    # source line 206); reconstructed here.
    def __init__(self):
        self.edges = []   # list of SentenceEdge leaving this node
        self.done = False  # True when a sentence may end at this node
215 
216 
class SentenceEdge:
    """
    An edge in a sentence.

    :ivar word: The word to be understood.
    :ivar node: Node for the remainder of the sentence.
    """
    # NOTE(review): the 'class SentenceEdge:' header line was stripped by the
    # documentation extraction (the trailing index places the definition at
    # source line 217); reconstructed here.
    def __init__(self, word, node):
        self.word = word
        self.node = node
227 
def expand_tree(rules, target='T'):
    """
    Expands the grammar tree based on the words in the grammar rules.

    :param rules: Extracted rules from the grammar file.
    :param target: Target rule to expand, default is 'T'.
    :return: The root of the expanded tree.
    :rtype: SentenceNode
    """
    # Nodes already created, keyed by the frozen set of their suffix strings.
    known_nodes = {}

    # Pending (node, rule suffixes) pairs that still need their edges built.
    pending = []

    # Seed the traversal with (copies of) all alternatives of the target rule.
    start_alternatives = [option.conjuncts[:] for option in rules[target].options]
    root = assign_node(start_alternatives, known_nodes, pending, rules)

    while pending:
        node, alternatives = pending.pop()

        # Group the alternatives by their first word; each group's tails are
        # the successor sentences reachable after recognizing that word.
        grouped = {}
        for alternative in alternatives:
            grouped.setdefault(alternative[0].name, []).append(alternative[1:])

        # Create one outgoing edge per distinct first word.
        for word, tails in grouped.items():
            follow_node = assign_node(tails, known_nodes, pending, rules)
            node.edges.append(SentenceEdge(word, follow_node))

    return root
268 
269 
def expand_sentences(sentence_list, rules):
    """
    Expands the grammar rules until all variables at the first position have
    been eliminated.

    :param sentence_list: List of sentence alternatives (lists of conjuncts).
    :param rules: Rules of the grammar.
    :return: (end_found, expanded_list) — whether an end of a sentence was
        found, and the expanded list whose first entries are all words.
    """
    reached_end = False
    while sentence_list:
        # An empty alternative means a sentence can end here; it also forces
        # one more rewrite pass so it gets dropped from the list.
        has_empty = any(not alternative for alternative in sentence_list)
        if has_empty:
            reached_end = True

        # A leading variable needs to be substituted by its rule options.
        has_variable = any(alternative and alternative[0].is_variable
                           for alternative in sentence_list)

        # Every first entry is already a word and nothing to drop — done.
        if not (has_empty or has_variable):
            break

        # Rewrite pass: drop empties, keep word-led alternatives, and replace
        # each variable-led alternative by one alternative per rule option.
        rewritten = []
        for alternative in sentence_list:
            if not alternative:
                continue
            head = alternative[0]
            if not head.is_variable:
                rewritten.append(alternative)
            else:
                for option in rules[head.name].options:
                    rewritten.append(option.conjuncts + alternative[1:])

        sentence_list = rewritten

    return reached_end, sentence_list
313 
314 
def stringify_suffixes(expanded_list):
    """
    Convert the current rule suffixes to string form.

    :param expanded_list: List of rule suffixes to convert.
    :return: Set of suffixes, after converting each to a string.
    """
    # Each suffix becomes the space-joined sequence of its conjunct names;
    # duplicates collapse because the result is a set.
    return {" ".join(conjunct.name for conjunct in suffix)
            for suffix in expanded_list}
327 
328 
def assign_node(sentence_list, available_nodes, work_list, rules):
    """
    For a given list of rule suffixes, find or add a node, and update the work
    list if necessary.

    :param sentence_list: List of rule suffixes to find or add a node for.
    :type sentence_list: List of rule alternatives (a list of conjuncts, partly
        expanded to words; in particular, the first conjunct should not be a
        variable).

    :param available_nodes: Known set of rule suffixes and their associated
        nodes. May be updated.
    :type available_nodes: Dict of frozenset of str to SentenceNode

    :param work_list: List of rule suffixes that need further processing.
        May be updated.
    :type work_list: List of pairs (node, rule suffixes).

    :param rules: Rules of the grammar.

    :return: Node associated with the provided sentence_list.
    """
    end_found, expanded = expand_sentences(sentence_list, rules)

    # Two suffix lists describe the same node iff their stringified suffix
    # sets are equal, so that set (frozen, hence hashable) is the cache key.
    key = frozenset(stringify_suffixes(expanded))
    node = available_nodes.get(key)
    if node is not None:
        # Already built — reuse it; nothing new to process.
        return node

    node = SentenceNode()
    node.done = end_found
    available_nodes[key] = node

    # Empty suffixes mean a sentence may end at this node; only the non-empty
    # ones are queued for further expansion.
    remaining = []
    for suffix in expanded:
        if suffix:
            remaining.append(suffix)
        else:
            node.done = True

    work_list.append((node, remaining))
    return node
365 
366 
def print_graphviz(root_node, outpath):
    """
    Prints Graphviz input of the tree.

    Writes 'grammar_tree.dot' into outpath and renders it to PDF via
    graphviz.render.

    :param root_node: Root of the tree
    :param outpath: Directory where the .dot file (and its rendering) is written
    """

    # Stack-based traversal; nodes can be reached via several edges, so each
    # one is numbered on first sight and emitted only once.
    work_list = [root_node]
    node_numbers = {}        # node -> assigned number (numbering starts at 1)
    printed_numbers = set()  # numbers of nodes already emitted
    next_free_number = 1

    graphviz_dotfile_string = "digraph G {\n"

    while work_list:
        node = work_list.pop()
        number = node_numbers.get(node)
        if not number:
            # First time this node is seen: give it a fresh number.
            node_numbers[node] = next_free_number
            number = next_free_number
            next_free_number += 1
        else:
            if number in printed_numbers:
                # Node (and its edges) already emitted; skip the duplicate.
                continue

        # Print the node. End-of-sentence nodes are drawn as boxes,
        # others as ellipses.
        if node.done:
            shape = "box"
        else:
            shape = "ellipse"
        node_text = "node{}".format(number)
        printed_numbers.add(number)
        graphviz_dotfile_string += "{} [shape={}];".format(node_text, shape) \
            + "\n"

        # Print its edges.
        for edge in node.edges:
            number = node_numbers.get(edge.node)
            if not number:
                # Number the destination now so the edge can reference it
                # before the destination node itself is printed.
                node_numbers[edge.node] = next_free_number
                number = next_free_number
                next_free_number += 1
            dest_text = "node{}".format(number)
            work_list.append(edge.node)
            graphviz_dotfile_string += "{} -> {} [label={}];".format(node_text,
                                                                     dest_text, edge.word) + "\n"

    graphviz_dotfile_string += "}"

    # Print and render the graphviz file at the output location
    dotfile_path = os.path.join(outpath, "grammar_tree.dot")
    with open(dotfile_path, 'w') as f:
        f.write(graphviz_dotfile_string)

    # Function call to graphviz.render
    render("dot", "pdf", dotfile_path)
speech_recognition.kaldi_grammar.Grammar
Definition: kaldi_grammar.py:14
speech_recognition.kaldi_grammar.SentenceNode.__init__
def __init__(self)
Definition: kaldi_grammar.py:212
speech_recognition.kaldi_grammar.Grammar.grammar_file
grammar_file
Definition: kaldi_grammar.py:37
speech_recognition.kaldi_grammar.expand_tree
def expand_tree(rules, target='T')
Definition: kaldi_grammar.py:228
speech_recognition.kaldi_grammar.Grammar.parser
parser
Definition: kaldi_grammar.py:36
speech_recognition.kaldi_grammar.SentenceNode
Definition: kaldi_grammar.py:206
speech_recognition.kaldi_grammar.Grammar.autocomplete
def autocomplete(self)
Definition: kaldi_grammar.py:82
speech_recognition.kaldi_grammar.Grammar.print_graphviz
def print_graphviz(self)
Definition: kaldi_grammar.py:199
speech_recognition.kaldi_grammar.Grammar.tree
tree
Definition: kaldi_grammar.py:48
speech_recognition.kaldi_grammar.SentenceNode.done
done
Definition: kaldi_grammar.py:214
speech_recognition.kaldi_grammar.SentenceEdge
Definition: kaldi_grammar.py:217
speech_recognition.kaldi_grammar.stringify_suffixes
def stringify_suffixes(expanded_list)
Definition: kaldi_grammar.py:315
speech_recognition.kaldi_grammar.print_graphviz
def print_graphviz(root_node, outpath)
Definition: kaldi_grammar.py:367
speech_recognition.kaldi_grammar.Grammar.expand_tree_
def expand_tree_(self)
Definition: kaldi_grammar.py:177
speech_recognition.kaldi_grammar.Grammar.__init__
def __init__(self, model_path, grammar_file_string, target)
Definition: kaldi_grammar.py:20
speech_recognition.kaldi_grammar.SentenceEdge.word
word
Definition: kaldi_grammar.py:224
speech_recognition.kaldi_grammar.SentenceEdge.__init__
def __init__(self, word, node)
Definition: kaldi_grammar.py:223
speech_recognition.kaldi_grammar.SentenceEdge.node
node
Definition: kaldi_grammar.py:225
speech_recognition.kaldi_grammar.assign_node
def assign_node(sentence_list, available_nodes, work_list, rules)
Definition: kaldi_grammar.py:329
speech_recognition.kaldi_grammar.SentenceNode.edges
edges
Definition: kaldi_grammar.py:213
speech_recognition.kaldi_grammar.Grammar.grammar_string
grammar_string
Definition: kaldi_grammar.py:38
speech_recognition.kaldi_grammar.Grammar.model_path
model_path
Definition: kaldi_grammar.py:22
speech_recognition.kaldi_grammar.expand_sentences
def expand_sentences(sentence_list, rules)
Definition: kaldi_grammar.py:270
speech_recognition.kaldi_grammar.Grammar.model_path_tmp
model_path_tmp
Definition: kaldi_grammar.py:23
speech_recognition.kaldi_grammar.Grammar.target
target
Definition: kaldi_grammar.py:44
speech_recognition.kaldi_grammar.Grammar.check_word
def check_word(self, recognition='', initial_list=[])
Definition: kaldi_grammar.py:128
speech_recognition.kaldi_grammar.Grammar.print_nicely
def print_nicely(self, sentence_list)
Definition: kaldi_grammar.py:166
speech_recognition.kaldi_grammar.Grammar.parse
def parse(self, sentence)
Definition: kaldi_grammar.py:188
speech_recognition.kaldi_grammar.Grammar.get_words_
def get_words_(self)
Definition: kaldi_grammar.py:50