Below is the file 'gcide.py' from this revision. You can also download the file.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract syllabified headwords from the GCIDE dictionary XML files.

The dictionary is read from ``data/gcide``: ``gcide.xml`` references the
per-letter files ``gcide_a.xml`` ... ``gcide_z.xml`` through entities.
Those references are expanded once into a single ``largefile.xml``, which
is then parsed with libxml2 and scanned for ``<hw>`` elements.
"""

import os
import re
import sys

import libxml2

dictionary_path = os.path.join('data', 'gcide')
largefile = os.path.join(dictionary_path, 'largefile.xml')

# A line that uses a per-letter entity, e.g. ``&gcide_a;``.
_ENTITY_REF = re.compile(r'.*&(gcide_[a-z]);')
# A line that declares such an entity (``... SYSTEM "gcide_a.xml">``).
_ENTITY_DECL = re.compile(r'.*SYSTEM "(gcide_[a-z])\.xml">')
# Separators between the individual words of one headword.
_WORD_SPLITTER = re.compile(r' |\t|-')
# Runs of accent / syllable-break marks inside a single word.
_ACCENT_SPLITTER = re.compile(r'[\*|\-|\"|\`]+')


def produce_large_file():
    """Build ``largefile`` by inlining the per-letter files into gcide.xml.

    Does nothing when ``largefile`` is already readable.  Otherwise every
    line of ``gcide.xml`` is copied through, except that

    * a line using an entity ``&gcide_X;`` is replaced by the full content
      of ``gcide_X.xml``, and
    * a line declaring such an entity (``SYSTEM "gcide_X.xml">``) is
      dropped.

    Raises:
        IOError / OSError: if an input file cannot be read or the output
            cannot be written.  A partially written ``largefile`` is
            removed in that case, so a later call starts from scratch
            instead of mistaking the truncated file for a finished one.
    """
    if os.access(largefile, os.R_OK):
        return

    out = open(largefile, 'w')
    complete = False
    try:
        with open(os.path.join(dictionary_path, 'gcide.xml')) as master:
            for line in master:
                # The entity-use check comes first, as a line could in
                # principle match both patterns.
                m = _ENTITY_REF.match(line)
                if m:
                    subdict = os.path.join(dictionary_path,
                                           m.group(1) + '.xml')
                    with open(subdict) as part:
                        out.write(part.read())
                    continue
                if _ENTITY_DECL.match(line):
                    # eat these
                    continue
                # otherwise, write the line through
                out.write(line)
        complete = True
    finally:
        # Close (and thereby flush) before anyone parses the file.
        out.close()
        if not complete:
            os.remove(largefile)


def _syllabify(headword):
    """Split *headword* into words, and each word into its syllables.

    The headword is split on spaces, tabs and hyphens; each piece is
    lower-cased and split on runs of accent marks, discarding empty
    fragments.

    Returns:
        A list with one list of syllable strings per word.  Words that
        yield fewer than two syllables are left out: many words seem to
        have been entered without syllable data, so they are ignored.
    """
    result = []
    for token in _WORD_SPLITTER.split(headword):
        # A real list (not ``filter``) so that ``len`` works everywhere.
        syllables = [s for s in _ACCENT_SPLITTER.split(token.lower()) if s]
        if len(syllables) < 2:
            continue
        result.append(syllables)
    return result


def words():
    """Yield a ``Word`` for every syllabified word found in ``<hw>`` tags.

    Ensures ``largefile`` exists, parses it, and walks all ``hw``
    elements.  Each yielded object is built from the word with its accent
    marks removed and gets a ``syllables`` attribute holding the number of
    syllables.

    The parsed document is freed when the generator finishes, is closed
    early, or fails with an exception.
    """
    # NOTE(review): ``Word`` is neither defined nor imported in this file
    # as shown; presumably it comes from the surrounding project -- confirm
    # and add the import.
    produce_large_file()
    doc = libxml2.parseFile(largefile)
    try:
        for hw in doc.xpathEval('//hw'):
            child = hw.get_children()
            if not child:
                continue
            val = child.get_content()
            # special case, seems to be a frequent problem in the doc
            if val == '‖':
                continue
            for syllables in _syllabify(val):
                word = Word(''.join(syllables))
                word.syllables = len(syllables)
                yield word
    finally:
        doc.freeDoc()


if __name__ == '__main__':
    # An explicit loop: ``map`` must not be relied on for side effects.
    for w in words():
        sys.stdout.write(w + '\n')