The unified diff between revisions [22f61b7a..] and [e134775a..] is displayed below. It can also be downloaded as a raw diff.

#
#
# add_file "potd/potd2.py"
#  content [ec45564cd6d6c074f7c95e286f0ff9b10ed810f9]
#
#   set "potd/potd2.py"
#  attr "mtn:execute"
# value "true"
#
============================================================
--- potd/potd2.py	ec45564cd6d6c074f7c95e286f0ff9b10ed810f9
+++ potd/potd2.py	ec45564cd6d6c074f7c95e286f0ff9b10ed810f9
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+import libxml2
+import os, sha, pickle, urllib, heapq, sys, urllib2, urlparse
+import traceback
+
+wikipedia = 'http://en.wikipedia.org'
+picture_uri = wikipedia + '/w/index.php?title=Category:Wikipedia_featured_pictures'
+cache_path = './cache/'
+image_path = './images/'
+meta_path = './meta/'
+tmp_path = './tmp/'
+
+def cache_result():
+    def decorate(f):
+        def new_f(*args, **kwds):
+            arg_hash = sha.new()
+            arg_hash.update(pickle.dumps(args))
+            arg_hash.update(pickle.dumps(kwds))
+            cache_file = os.path.join(cache_path, urllib.quote(f.func_name) + '.' + arg_hash.hexdigest())
+            try:
+                return pickle.load(open(cache_file))
+            except:
+                rv = f(*args, **kwds)
+                pickle.dump(rv, open(cache_file, 'w'))
+                return rv
+        new_f.func_name = f.func_name
+        return new_f
+    return decorate
+
+def get_property(node, property_name):
+    prop = node.get_properties()
+    while prop:
+        if prop.get_name() == property_name:
+            return prop.get_content()
+        prop = prop.next
+
+def retrieve_uri(uri):
+    sys.stdout.write("retrieving: %s\n" % uri)
+    req = urllib2.Request(uri)
+    req.add_header('User-Agent', 'potd.py')
+    return urllib2.urlopen(req)
+
+def retrieve_uri_data(uri):
+    rv = []
+    fd = retrieve_uri(uri)
+    while True:
+        data = fd.read(8192)
+        if data == '': break
+        rv.append(data)
+    return ''.join(rv)
+
+@cache_result()
+def get_image_and_metadata(uri):
+    data = retrieve_uri_data(uri)
+    doc = libxml2.htmlParseDoc(data, 'utf-8')
+#    table = doc.xpathEval('//table[tr/th/text()="Description"]')[0]
+    imguris = doc.xpathEval('//table[@class="filehistory"]/tr/td/a')
+    if len(imguris) == 0:
+	    raise Exception("couldn't find img uri!")
+    imguri = get_property(imguris[0], "href")
+    imgname = urlparse.urlparse(imguri)[2].split('/')[-1]
+    tmp, out = map(lambda x: os.path.join(x, imgname),
+                   [tmp_path, image_path])
+    tmp_html, out_html = map(lambda x: os.path.join(x, imgname.rsplit('.', 1)[0] + '.html'),
+                             [tmp_path, meta_path])
+    open(tmp_html, 'w').write(data)
+    fd = open(tmp, 'w')
+    data_fd = retrieve_uri(imguri)
+    while True:
+        data = data_fd.read(8192)
+        if data == '': break
+        fd.write(data)
+    os.rename (tmp_html, out_html)
+    os.rename (tmp, out)
+
+@cache_result()
+def pictures_of_the_day():
+    heap = [picture_uri]
+    rv = []
+    while heap:
+        uri = heapq.heappop(heap)
+        next_uri = None
+        data = retrieve_uri_data(uri)
+        doc = libxml2.htmlParseDoc(data, 'utf-8')
+        for node in doc.xpathEval('//div[@class="gallerybox"]//a'):
+            rv.append(wikipedia + get_property(node, 'href'))
+        next_uris = doc.xpathEval('//a[text()="next 200"]')
+        if len(next_uris) > 1:
+            next_uri = wikipedia + get_property(next_uris[0], 'href')
+        if next_uri is not None:
+            heapq.heappush(heap, next_uri)
+    return rv
+
+if __name__ == '__main__':
+    pics = pictures_of_the_day()
+    print "We have", len(pics), "pictures of the day!"
+    for pic_uri in pics:
+        try:
+            get_image_and_metadata(pic_uri)
+        except:
+            print traceback.format_exc()
+#    print libxml2.htmlParseDoc(open('test2.xml').read(), 'utf-8').xpathEval('//table[tr/th/text()="Description"]')
+#   print libxml2.htmlParseDoc(open('test.xml').read(), 'utf-8').xpathEval('//a[text()="next 200"]')