# Below is the file 'potd/potd2.py' from this revision. You can also download the file.

#!/usr/bin/env python

import libxml2
import os, sha, pickle, urllib, heapq, sys, urllib2, urlparse
import traceback

# Base URL that the site-relative hrefs scraped from pages are appended to.
wikipedia = 'http://en.wikipedia.org'
# Category page where pictures_of_the_day() starts crawling.
picture_uri = wikipedia + '/w/index.php?title=Category:Wikipedia_featured_pictures'
# Directory holding the pickled results written by the cache_result decorator.
cache_path = './cache/'
# Final location of downloaded image files.
image_path = './images/'
# Final location of the saved HTML page belonging to each image.
meta_path = './meta/'
# Staging directory; files are written here and then renamed into place.
tmp_path = './tmp/'

def cache_result():
    """Return a decorator that caches a function's result on disk.

    The cache file lives in ``cache_path`` and is named after the function
    plus a SHA-1 digest of the pickled positional and keyword arguments.
    When a readable cache file exists its unpickled content is returned
    without calling the function; otherwise the function is called, its
    return value is pickled to the cache file and then returned.

    NOTE: the cache is read with ``pickle.load``, so the contents of
    ``cache_path`` are trusted; do not point it at a directory that
    untrusted parties can write to.
    """
    def decorate(f):
        def new_f(*args, **kwds):
            arg_hash = sha.new()
            arg_hash.update(pickle.dumps(args))
            arg_hash.update(pickle.dumps(kwds))
            cache_file = os.path.join(cache_path, urllib.quote(f.func_name) + '.' + arg_hash.hexdigest())
            try:
                fd = open(cache_file)
                try:
                    return pickle.load(fd)
                finally:
                    # Always release the handle, even if unpickling fails.
                    fd.close()
            except Exception:
                # A missing or unreadable cache entry is treated as a miss.
                # "except Exception" rather than a bare "except" so that
                # KeyboardInterrupt/SystemExit are not mistaken for a miss.
                rv = f(*args, **kwds)
                fd = open(cache_file, 'w')
                try:
                    pickle.dump(rv, fd)
                finally:
                    # Close explicitly so the entry is flushed to disk.
                    fd.close()
                return rv
        new_f.func_name = f.func_name
        new_f.__doc__ = f.__doc__
        return new_f
    return decorate

def get_property(node, property_name):
    """Return the content of the named property of ``node``.

    Walks the property chain returned by ``node.get_properties()`` via each
    entry's ``next`` link and returns the content of the first entry whose
    name equals ``property_name``.  Returns None when nothing matches.
    """
    attr = node.get_properties()
    while attr:
        if attr.get_name() != property_name:
            attr = attr.next
            continue
        return attr.get_content()
    return None

def retrieve_uri(uri):
    """Open ``uri`` and return the response object from ``urllib2.urlopen``.

    A progress line is written to stdout first, and the request is sent with
    the User-Agent header set to 'potd.py'.
    """
    sys.stdout.write("retrieving: %s\n" % uri)
    request = urllib2.Request(uri, headers={'User-Agent': 'potd.py'})
    return urllib2.urlopen(request)

def retrieve_uri_data(uri):
    """Fetch ``uri`` and return the whole response body as one string.

    The body is read in 8192-byte chunks from the handle returned by
    ``retrieve_uri``.  The handle is closed before returning, including when
    a read raises, so the connection is not left to the garbage collector.
    """
    fd = retrieve_uri(uri)
    try:
        chunks = []
        while True:
            data = fd.read(8192)
            if not data:
                # An empty read marks the end of the response.
                break
            chunks.append(data)
    finally:
        fd.close()
    return ''.join(chunks)

@cache_result()
def get_image_and_metadata(uri):
    """Download one picture together with the HTML page describing it.

    Fetches the page at ``uri`` and takes the first link inside the
    ``filehistory`` table as the image location.  The image is stored under
    ``image_path`` and the page HTML under ``meta_path`` (same base name,
    ``.html`` extension).  Both are written under ``tmp_path`` first and only
    renamed into place once the image download has completed.

    Returns None.  Because of ``@cache_result()`` that None is cached, so a
    repeated call with the same ``uri`` does not download again.

    Raises Exception when no image link is found on the page; errors from
    fetching, parsing or writing propagate unchanged.
    """
    page = retrieve_uri_data(uri)
    doc = libxml2.htmlParseDoc(page, 'utf-8')
    try:
        imguris = doc.xpathEval('//table[@class="filehistory"]/tr/td/a')
        if len(imguris) == 0:
            raise Exception("couldn't find img uri!")
        href = get_property(imguris[0], "href")
    finally:
        # The libxml2 bindings expect parsed documents to be released
        # explicitly; the href string has already been copied out.
        doc.freeDoc()
    if not href:
        # The first link carried no href attribute.
        raise Exception("couldn't find img uri!")

    # Resolve relative or scheme-relative hrefs against the page address;
    # an href that is already absolute is returned unchanged by urljoin.
    imguri = urlparse.urljoin(uri, href)
    imgname = urlparse.urlparse(imguri)[2].split('/')[-1]
    html_name = imgname.rsplit('.', 1)[0] + '.html'
    tmp = os.path.join(tmp_path, imgname)
    out = os.path.join(image_path, imgname)
    tmp_html = os.path.join(tmp_path, html_name)
    out_html = os.path.join(meta_path, html_name)

    # Binary mode keeps the downloaded bytes intact on every platform.
    html_fd = open(tmp_html, 'wb')
    try:
        html_fd.write(page)
    finally:
        html_fd.close()

    data_fd = retrieve_uri(imguri)
    try:
        fd = open(tmp, 'wb')
        try:
            while True:
                chunk = data_fd.read(8192)
                if not chunk:
                    break
                fd.write(chunk)
        finally:
            # Close (and thereby flush) before the file is renamed below.
            fd.close()
    finally:
        data_fd.close()

    os.rename(tmp_html, out_html)
    os.rename(tmp, out)

@cache_result()
def pictures_of_the_day():
    """Collect the page URIs of the featured pictures.

    Starts at ``picture_uri`` and follows the "next 200" link from page to
    page until a page has no such link.  On every page, the href of each
    anchor inside a ``gallerybox`` div is prefixed with ``wikipedia`` and
    appended to the result; anchors without an href are skipped.

    Returns the list of URI strings in the order they were encountered.
    The result is cached on disk by ``@cache_result()``.
    """
    rv = []
    uri = picture_uri
    while uri is not None:
        data = retrieve_uri_data(uri)
        doc = libxml2.htmlParseDoc(data, 'utf-8')
        next_uri = None
        try:
            for node in doc.xpathEval('//div[@class="gallerybox"]//a'):
                href = get_property(node, 'href')
                if href:
                    rv.append(wikipedia + href)
            next_uris = doc.xpathEval('//a[text()="next 200"]')
            # One matching link is enough to continue; requiring more than
            # one would stop the crawl on pages that show the link once.
            if len(next_uris) > 0:
                next_href = get_property(next_uris[0], 'href')
                if next_href:
                    next_uri = wikipedia + next_href
        finally:
            # The libxml2 bindings expect parsed documents to be released
            # explicitly; all needed strings have been copied out above.
            doc.freeDoc()
        uri = next_uri
    return rv

if __name__ == '__main__':
    # Make sure the working directories exist up front; otherwise the first
    # cache or file write fails only after the crawl has already been done.
    for path in (cache_path, image_path, meta_path, tmp_path):
        if not os.path.isdir(path):
            os.makedirs(path)

    pics = pictures_of_the_day()
    sys.stdout.write("We have %d pictures of the day!\n" % len(pics))
    for pic_uri in pics:
        try:
            get_image_and_metadata(pic_uri)
        except Exception:
            # Best effort: report the failure and carry on with the next
            # picture.  "except Exception" (not a bare "except") so that
            # Ctrl-C still stops the run.
            sys.stdout.write(traceback.format_exc() + "\n")
# Scratch XPath checks kept from development:
#    print libxml2.htmlParseDoc(open('test2.xml').read(), 'utf-8').xpathEval('//table[tr/th/text()="Description"]')
#    print libxml2.htmlParseDoc(open('test.xml').read(), 'utf-8').xpathEval('//a[text()="next 200"]')