Below is the file 'gcide.py' from this revision. You can also download the file.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import libxml2
import sys
import os
import re

# Directory holding the GCIDE XML files: the top-level gcide.xml and the
# per-letter gcide_<letter>.xml files that produce_large_file() reads.
dictionary_path = os.path.join('data', 'gcide')
# Merged single-file copy of the dictionary; written by produce_large_file()
# and parsed by words().
largefile = os.path.join(dictionary_path, 'largefile.xml')

def produce_large_file():
    """Merge gcide.xml and the per-letter files it references into ``largefile``.

    gcide.xml is copied line by line with two exceptions:

    * a line containing an ``&gcide_<letter>;`` entity reference is replaced
      by the full contents of ``gcide_<letter>.xml``;
    * a line declaring one of those external entities
      (``SYSTEM "gcide_<letter>.xml">``) is dropped, since the reference it
      declares has been inlined.

    Does nothing when ``largefile`` already exists and is readable.

    Raises:
        IOError / OSError: if an input file cannot be read or the output
            cannot be written or renamed.
    """
    if os.access(largefile, os.R_OK):
        return

    entity_match = re.compile(r'.*&(gcide_[a-z]);')
    external_parsed_match = re.compile(r'.*SYSTEM "(gcide_[a-z])\.xml">')

    # The readability check above treats any existing largefile as complete,
    # so build the output under a temporary name and rename it only once it
    # has been fully written; an interrupted run must not leave a truncated
    # file that later runs would accept.
    partial = largefile + '.partial'
    try:
        with open(partial, 'w') as merged:
            with open(os.path.join(dictionary_path, 'gcide.xml')) as index:
                for line in index:
                    m = entity_match.match(line)
                    if m:
                        subdict = m.group(1)
                        sub_path = os.path.join(dictionary_path,
                                                subdict + '.xml')
                        with open(sub_path) as sub:
                            merged.write(sub.read())
                        continue
                    if external_parsed_match.match(line):
                        # eat these
                        continue
                    # otherwise, write the line through
                    merged.write(line)
        os.rename(partial, largefile)
    finally:
        # After a successful rename the temporary name no longer exists;
        # this only cleans up after a failure.
        if os.path.exists(partial):
            os.remove(partial)

def words():
    """Yield a ``Word`` for each headword fragment that has syllable marks.

    Ensures ``largefile`` exists (via ``produce_large_file()``), parses it
    with libxml2 and visits every ``<hw>`` element. The content of the node
    returned by ``get_children()`` is split on spaces, tabs and hyphens;
    each piece is lower-cased and split again on runs of the marker
    characters ``*``, ``|``, ``-``, ``"`` and `````. Pieces that yield fewer
    than two non-empty syllables are skipped.

    Yields:
        Word: built from the syllables joined without markers, with its
        ``syllables`` attribute set to the syllable count.

    The parsed document is freed when the generator finishes, is closed, or
    raises.
    """
    # NOTE(review): ``Word`` is not defined or imported in the visible part
    # of this file -- confirm where it is expected to come from.
    produce_large_file()
    doc = libxml2.parseFile(largefile)
    word_splitter = re.compile(r' |\t|-')
    accent_splitter = re.compile(r'[\*|\-|\"|\`]+')
    try:
        for hw in doc.xpathEval('//hw'):
            child = hw.get_children()
            if not child:
                continue
            val = child.get_content()
            # special case, seems to be a frequent problem in the doc
            if val == '‖':
                continue

            for token in word_splitter.split(val):
                # A list (not a lazy filter object) is needed because the
                # syllables are both counted and joined below.
                syllables = [s for s in accent_splitter.split(token.lower())
                             if s]
                # many words seem to have not been entered with syllable
                # data; in this case, let's ignore them
                if len(syllables) < 2:
                    continue
                word = Word(''.join(syllables))
                word.syllables = len(syllables)
                yield word
    finally:
        # Runs even if the consumer abandons the generator early, so the
        # libxml2 document is always released.
        doc.freeDoc()

if __name__ == '__main__':
    # Print one word per line. An explicit loop is used instead of map():
    # map() would build a throwaway list of None on Python 2 and, being lazy
    # on Python 3, would never perform the writes at all.
    for w in words():
        sys.stdout.write(w + '\n')