Below is the file 'potd/potd2.py' from this revision. You can also download the file.
#!/usr/bin/env python
"""Download Wikipedia's featured pictures together with their description pages.

The script walks the "Wikipedia featured pictures" category listing, collects
the link of every gallery entry, then fetches each linked page, saves its HTML
under ``meta_path`` and the first file linked from its "filehistory" table
under ``image_path``.  Results of the two expensive steps are memoised on disk
in ``cache_path`` so that an interrupted run can be resumed.

This is Python 2 code (``urllib2``, ``urlparse``, ``sha``).
"""

import contextlib
import functools
import heapq
import os
import pickle
import sha
import shutil
import sys
import traceback
import urllib
import urllib2
import urlparse

import libxml2

wikipedia = 'http://en.wikipedia.org'
picture_uri = wikipedia + '/w/index.php?title=Category:Wikipedia_featured_pictures'
cache_path = './cache/'
image_path = './images/'
meta_path = './meta/'
tmp_path = './tmp/'


def _ensure_dir(path):
    """Create directory *path* (and its parents) if it does not exist yet."""
    if not os.path.isdir(path):
        os.makedirs(path)


def cache_result():
    """Return a decorator that memoises a function's return value on disk.

    The cache entry lives in ``cache_path``; its file name is the quoted
    function name, a dot, and the SHA-1 hex digest of the pickled positional
    and keyword arguments.  On a hit the pickled value is returned without
    calling the function; on a miss the function is called, its result is
    pickled to the entry, and then returned.  Exceptions raised by the
    decorated function propagate and nothing is cached.
    """
    def decorate(f):
        @functools.wraps(f)
        def new_f(*args, **kwds):
            arg_hash = sha.new()
            arg_hash.update(pickle.dumps(args))
            arg_hash.update(pickle.dumps(kwds))
            cache_file = os.path.join(
                cache_path,
                urllib.quote(f.__name__) + '.' + arg_hash.hexdigest())
            # NOTE(security): unpickling runs code embedded in the file, so
            # the cache directory must only ever contain files this script
            # wrote itself.
            try:
                with open(cache_file, 'rb') as fd:
                    return pickle.load(fd)
            except Exception:
                # Missing, unreadable or corrupt entry: recompute below.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                pass
            rv = f(*args, **kwds)
            _ensure_dir(cache_path)
            with open(cache_file, 'wb') as fd:
                pickle.dump(rv, fd)
            return rv
        return new_f
    return decorate


def get_property(node, property_name):
    """Return the content of attribute *property_name* of a libxml2 *node*.

    Walks the node's property list and returns the content of the first
    property whose name matches, or ``None`` when there is no such property.
    """
    prop = node.get_properties()
    while prop:
        if prop.get_name() == property_name:
            return prop.get_content()
        prop = prop.next
    return None


def retrieve_uri(uri):
    """Open *uri* with a ``potd.py`` User-Agent and return the response object.

    The URI is logged to stdout first.  The caller owns the returned
    file-like object and is responsible for closing it.
    """
    sys.stdout.write("retrieving: %s\n" % uri)
    req = urllib2.Request(uri)
    req.add_header('User-Agent', 'potd.py')
    return urllib2.urlopen(req)


def retrieve_uri_data(uri):
    """Fetch *uri* and return the complete response body as a string."""
    with contextlib.closing(retrieve_uri(uri)) as fd:
        return fd.read()


@cache_result()
def get_image_and_metadata(uri):
    """Download the page at *uri* and the first file its history table links.

    The page HTML is stored as ``<name>.html`` in ``meta_path`` and the file
    as ``<name>.<ext>`` in ``image_path``, where the name is the last path
    component of the file's URI.  Both are first written to ``tmp_path`` and
    only renamed into place once the download has finished, so the output
    directories never contain partial files.

    Returns ``None`` (which is what the disk cache records on success).
    Raises ``Exception`` when the page has no usable link in a
    ``table.filehistory`` cell; network and file errors propagate.
    """
    data = retrieve_uri_data(uri)
    doc = libxml2.htmlParseDoc(data, 'utf-8')
    try:
        # table = doc.xpathEval('//table[tr/th/text()="Description"]')[0]
        imguris = doc.xpathEval('//table[@class="filehistory"]/tr/td/a')
        if not imguris:
            raise Exception("couldn't find img uri!")
        imguri = get_property(imguris[0], "href")
    finally:
        # libxml2 documents are not garbage collected; release explicitly.
        doc.freeDoc()
    if not imguri:
        raise Exception("couldn't find img uri!")

    # Resolve relative and protocol-relative links against the page URI;
    # absolute links come back unchanged.
    imguri = urlparse.urljoin(uri, imguri)
    imgname = urlparse.urlparse(imguri)[2].split('/')[-1]
    htmlname = imgname.rsplit('.', 1)[0] + '.html'
    tmp = os.path.join(tmp_path, imgname)
    out = os.path.join(image_path, imgname)
    tmp_html = os.path.join(tmp_path, htmlname)
    out_html = os.path.join(meta_path, htmlname)

    for directory in (tmp_path, image_path, meta_path):
        _ensure_dir(directory)

    # Binary mode: the payloads are raw bytes and must not be newline-mangled.
    with open(tmp_html, 'wb') as fd:
        fd.write(data)
    with contextlib.closing(retrieve_uri(imguri)) as data_fd:
        with open(tmp, 'wb') as fd:
            shutil.copyfileobj(data_fd, fd, 8192)

    # Files are closed (and therefore flushed) before being moved into place.
    os.rename(tmp_html, out_html)
    os.rename(tmp, out)


@cache_result()
def pictures_of_the_day():
    """Return the absolute URIs of all entries in the featured-pictures category.

    Starting at ``picture_uri``, every ``<a>`` inside a ``div.gallerybox`` is
    collected (prefixed with ``wikipedia``), then the "next 200" link is
    followed until a page no longer has one.
    """
    rv = []
    visited = set()
    uri = picture_uri
    # ``visited`` guards against a listing that links back to an earlier page.
    while uri is not None and uri not in visited:
        visited.add(uri)
        data = retrieve_uri_data(uri)
        doc = libxml2.htmlParseDoc(data, 'utf-8')
        try:
            # Copy the attribute values out before the document is freed.
            hrefs = [get_property(node, 'href')
                     for node in doc.xpathEval('//div[@class="gallerybox"]//a')]
            next_hrefs = [get_property(node, 'href')
                          for node in doc.xpathEval('//a[text()="next 200"]')]
        finally:
            doc.freeDoc()

        # Anchors without an href are skipped rather than aborting the crawl.
        rv.extend(wikipedia + href for href in hrefs if href)

        # One "next 200" link is enough to continue.  The original code
        # required at least two (presumably because the page repeats the link
        # above and below the listing), which silently stops the crawl if
        # only one is present.
        next_hrefs = [href for href in next_hrefs if href]
        if next_hrefs:
            uri = wikipedia + next_hrefs[0]
        else:
            uri = None
    return rv


def main():
    """Fetch the picture list, then download every picture, logging failures."""
    pics = pictures_of_the_day()
    print("We have %d pictures of the day!" % len(pics))
    for pic_uri in pics:
        try:
            get_image_and_metadata(pic_uri)
        except Exception:
            # Best effort: report the failure and carry on with the next
            # picture.  Ctrl-C is not an ``Exception`` and still aborts.
            print(traceback.format_exc())


if __name__ == '__main__':
    main()

# XPath debugging snippets:
# print libxml2.htmlParseDoc(open('test2.xml').read(), 'utf-8').xpathEval('//table[tr/th/text()="Description"]')
# print libxml2.htmlParseDoc(open('test.xml').read(), 'utf-8').xpathEval('//a[text()="next 200"]')