speech_recognition
generate_bigram.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
3 #------------------------------------------------------------------------------
# This program generates the wp_gram file from a word list, such as the
# district or commodity list, e.g.: ./generate_bigram.py unique_word_list > wp_gram
6 #------------------------------------------------------------------------------
7 
8 import sys
9 from collections import defaultdict
10 
# Marker placed before the first word and after the last word of every phrase,
# so the output also records which words may start or end a phrase.
SENTENCE_BOUNDARY = "SENTENCE-END"


def read_phrases(path):
    """Return one list of whitespace-separated words per line of *path*.

    The file is closed as soon as it has been read.  A blank line yields an
    empty list, which later contributes a SENTENCE-END -> SENTENCE-END pair.
    """
    with open(path) as handle:
        return [line.split() for line in handle]


def build_successors(phrases):
    """Map every word to the set of words that directly follow it.

    Args:
        phrases: iterable of word lists, one list per input phrase.

    Returns:
        A ``defaultdict(set)`` keyed by word.  Each phrase is wrapped in
        ``SENTENCE_BOUNDARY`` markers before its adjacent pairs are recorded.
    """
    successors = defaultdict(set)
    for words in phrases:
        padded = [SENTENCE_BOUNDARY] + list(words) + [SENTENCE_BOUNDARY]
        for first, second in zip(padded, padded[1:]):
            successors[first].add(second)
    return successors


def format_bigrams(successors):
    """Render the successor map as a list of output lines.

    Each word appears as ``">" + word`` followed by one ``" " + successor``
    line per successor.  Both words and successors are sorted so that the
    generated file is identical from run to run.
    """
    lines = []
    for word in sorted(successors):
        lines.append(">" + word)
        lines.extend(" " + follower for follower in sorted(successors[word]))
    return lines


def main(argv):
    """Read the word list named by ``argv[1]`` and print its bigram table.

    Returns the process exit status: 0 on success, 2 when the word-list
    argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s unique_word_list > wp_gram\n" % argv[0])
        return 2
    for line in format_bigrams(build_successors(read_phrases(argv[1]))):
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))