Below is the file 'wiktionary.py' from this revision. You can also download the file.
#!/usr/bin/env python # -*- coding: utf-8 -*- import libxml2 import sys import os import re from ipa import contains_ipa_vowel dumpfile = os.path.join('data', 'wiktionary', 'pages_articles.xml') class Page(object): def __init__(self): self.title, self.contents = "", "" def pages(): to_yield = [] class Handler(object): def __init__(self): self.page = None def textToTitle(self, text): self.page.title += text def textToContents(self, text): self.page.contents += text def startElement(self, tag, attrs): if tag == "page": self.page = Page() elif tag == "title": self.characters = self.textToTitle elif tag == "text": self.characters = self.textToContents def endElement(self, tag): if tag == "page": to_yield.append(self.page) elif tag == "title" or tag == "text": del self.characters handler = libxml2.createPushParser(Handler(), '', 0, dumpfile) for line in open(dumpfile): if not line: break handler.parseChunk(line, len(line), False) if len(to_yield): for y in to_yield: yield y to_yield = [] handler.parseChunk('', 0, True) def words(): ipa_success = 0 non_dictionary_entry = re.compile(r'^[^ \t]+:') section_re = re.compile(r'^=== *(.*) *===') ipa_res = [ re.compile(r'.*[\[\|]IPA\]\]: \/?([^,$\/]+)\/?'), re.compile(r'.*\{\{IPA[char]*\|\/?([^\/]+)\/?'), re.compile(r'.*\/([^\/]+)\/') ] for page in pages(): ipa_for_word = [] if non_dictionary_entry.match(page.title): continue section = None for line in page.contents.splitlines(): m = section_re.match(line) if m != None: section = m.groups()[0].strip() continue if section == "Pronunciation": # attempt to extract IPA data for the word for match in (t.groups()[0] for t in filter(None, map(lambda r: r.match(line), ipa_res))): if match.startswith('[[') and match.endswith(']]'): idx = match.find('|') if idx != -1: match = match[idx+1:-2] if not contains_ipa_vowel(match): continue ipa_for_word.append(match) if ipa_for_word: print page.title, ipa_for_word ipa_success += 1 print "We have IPA for: %d" % ipa_success