Below is the file 'wiktionary.py' from this revision. You can also download the file.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import libxml2
import sys
import os
import re
from ipa import contains_ipa_vowel

# Path of the XML dump that pages() opens and parses. It is built from
# relative components, so it is resolved against the current working
# directory of the process (data/wiktionary/pages_articles.xml).
# NOTE(review): presumably a MediaWiki "pages-articles" export -- confirm.
dumpfile = os.path.join('data', 'wiktionary', 'pages_articles.xml')

class Page(object):
    """A single wiki page collected from the XML dump.

    Attributes:
        title: text accumulated for the page title; starts as "".
        contents: text accumulated for the page body; starts as "".
    """

    def __init__(self):
        # Both fields start empty because the SAX handler inside pages()
        # appends to them with += as character data arrives.
        self.title = ""
        self.contents = ""

def pages():
    """Yield a Page for every <page> element found in the dump file.

    The file named by the module-level `dumpfile` is fed line by line to a
    libxml2 push (SAX) parser, so the dump is read incrementally. A page is
    yielded once its closing </page> tag has been handled by the parser.

    Yields:
        Page: `title` holds the character data of the page's <title>
        element and `contents` that of its <text> element.
    """
    # Pages completed by the parser but not yet handed to the caller. It is
    # only ever mutated in place, so the handler below and this generator
    # always share the same list object.
    finished = []

    class Handler(object):
        """SAX callbacks that assemble Page objects from parser events."""

        def __init__(self):
            self.page = None

        def textToTitle(self, text):
            self.page.title += text

        def textToContents(self, text):
            self.page.contents += text

        def startElement(self, tag, attrs):
            if tag == "page":
                self.page = Page()
            elif tag == "title":
                # Text capture is switched on by installing a `characters`
                # callback on the instance for the duration of the element.
                self.characters = self.textToTitle
            elif tag == "text":
                self.characters = self.textToContents

        def endElement(self, tag):
            if tag == "page":
                finished.append(self.page)
            elif tag in ("title", "text"):
                # Switch text capture off again.
                # NOTE(review): this presumably relies on the libxml2
                # bindings skipping the callback when the handler has no
                # `characters` attribute -- confirm.
                del self.characters

    parser = libxml2.createPushParser(Handler(), '', 0, dumpfile)

    # Binary mode: the parser is given len(line) as the chunk size, which
    # has to be a byte count, and it performs its own decoding. The `with`
    # block also guarantees the file is closed, including when the caller
    # abandons the generator early.
    with open(dumpfile, 'rb') as dump:
        for line in dump:
            parser.parseChunk(line, len(line), False)
            if finished:
                for page in finished:
                    yield page
                del finished[:]

    # Tell the parser the input is complete, then hand out any pages that
    # were only finalised by this last call (previously they were dropped).
    parser.parseChunk('', 0, True)
    for page in finished:
        yield page

def _strip_wikilink(text):
    """Return the label of a piped wiki link, or `text` unchanged.

    "[[target|label]]" becomes "label". Text that is not wrapped in
    "[[" ... "]]", or that contains no "|", is returned as-is.
    """
    if text.startswith('[[') and text.endswith(']]'):
        bar = text.find('|')
        if bar != -1:
            return text[bar + 1:-2]
    return text


def words():
    """Print the IPA transcriptions found for each page, then a total.

    For every page produced by pages() whose title is not rejected as a
    non-dictionary entry, the lines under a "Pronunciation" heading are
    matched against several IPA patterns. Each page with at least one
    accepted transcription is printed as "<title> <list of transcriptions>".
    After all pages, the number of such pages is printed.

    Returns:
        None. All results are written to standard output.
    """
    ipa_success = 0
    # Titles that start with a run of non-blank characters followed by ':'
    # are skipped (presumably namespaced pages such as "Template:...").
    non_dictionary_entry = re.compile(r'^[^ \t]+:')
    # Only headings that begin with "===" are recognised.
    # NOTE(review): a level-2 heading ("==Language==") therefore does not
    # reset the current section -- confirm this is intended.
    section_re = re.compile(r'^=== *(.*) *===')
    ipa_res = [
        re.compile(r'.*[\[\|]IPA\]\]: \/?([^,$\/]+)\/?'),
        re.compile(r'.*\{\{IPA[char]*\|\/?([^\/]+)\/?'),
        re.compile(r'.*\/([^\/]+)\/'),
    ]

    for page in pages():
        if non_dictionary_entry.match(page.title):
            continue

        ipa_for_word = []
        section = None
        for line in page.contents.splitlines():
            heading = section_re.match(line)
            if heading is not None:
                section = heading.group(1).strip()
                continue
            if section != "Pronunciation":
                continue

            # Try every pattern on the line. A line may match more than one
            # of them, in which case each capture is recorded separately.
            for pattern in ipa_res:
                found = pattern.match(line)
                if found is None:
                    continue
                candidate = _strip_wikilink(found.group(1))
                # Keep only captures accepted by contains_ipa_vowel().
                if contains_ipa_vowel(candidate):
                    ipa_for_word.append(candidate)

        if ipa_for_word:
            # Single-argument print(...) gives the same output as the old
            # Python 2 `print a, b` statement and also runs on Python 3.
            print("%s %s" % (page.title, ipa_for_word))
            ipa_success += 1

    print("We have IPA for: %d" % ipa_success)