The unified diff between revisions [22f61b7a..] and [e134775a..] is displayed below. It can also be downloaded as a raw diff.
#
#
# add_file "potd/potd2.py"
# content [ec45564cd6d6c074f7c95e286f0ff9b10ed810f9]
#
# set "potd/potd2.py"
# attr "mtn:execute"
# value "true"
#
============================================================
--- potd/potd2.py ec45564cd6d6c074f7c95e286f0ff9b10ed810f9
+++ potd/potd2.py ec45564cd6d6c074f7c95e286f0ff9b10ed810f9
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+
+import libxml2
+import os, sha, pickle, urllib, heapq, sys, urllib2, urlparse
+import traceback
+
+# Base URL of the English Wikipedia; hrefs scraped from pages are appended to it.
+wikipedia = 'http://en.wikipedia.org'
+# First page of the featured-pictures category listing; the crawl starts here.
+picture_uri = wikipedia + '/w/index.php?title=Category:Wikipedia_featured_pictures'
+# Directory where cache_result() stores pickled function results.
+cache_path = './cache/'
+# Final destination of downloaded image files.
+image_path = './images/'
+# Final destination of the saved HTML description pages.
+meta_path = './meta/'
+# Scratch directory: downloads are written here first, then renamed into place.
+tmp_path = './tmp/'
+
+def cache_result():
+ def decorate(f):
+ def new_f(*args, **kwds):
+ arg_hash = sha.new()
+ arg_hash.update(pickle.dumps(args))
+ arg_hash.update(pickle.dumps(kwds))
+ cache_file = os.path.join(cache_path, urllib.quote(f.func_name) + '.' + arg_hash.hexdigest())
+ try:
+ return pickle.load(open(cache_file))
+ except:
+ rv = f(*args, **kwds)
+ pickle.dump(rv, open(cache_file, 'w'))
+ return rv
+ new_f.func_name = f.func_name
+ return new_f
+ return decorate
+
+def get_property(node, property_name):
+ prop = node.get_properties()
+ while prop:
+ if prop.get_name() == property_name:
+ return prop.get_content()
+ prop = prop.next
+
+def retrieve_uri(uri):
+ sys.stdout.write("retrieving: %s\n" % uri)
+ req = urllib2.Request(uri)
+ req.add_header('User-Agent', 'potd.py')
+ return urllib2.urlopen(req)
+
+def retrieve_uri_data(uri):
+ rv = []
+ fd = retrieve_uri(uri)
+ while True:
+ data = fd.read(8192)
+ if data == '': break
+ rv.append(data)
+ return ''.join(rv)
+
+@cache_result()
+def get_image_and_metadata(uri):
+ data = retrieve_uri_data(uri)
+ doc = libxml2.htmlParseDoc(data, 'utf-8')
+# table = doc.xpathEval('//table[tr/th/text()="Description"]')[0]
+ imguris = doc.xpathEval('//table[@class="filehistory"]/tr/td/a')
+ if len(imguris) == 0:
+ raise Exception("couldn't find img uri!")
+ imguri = get_property(imguris[0], "href")
+ imgname = urlparse.urlparse(imguri)[2].split('/')[-1]
+ tmp, out = map(lambda x: os.path.join(x, imgname),
+ [tmp_path, image_path])
+ tmp_html, out_html = map(lambda x: os.path.join(x, imgname.rsplit('.', 1)[0] + '.html'),
+ [tmp_path, meta_path])
+ open(tmp_html, 'w').write(data)
+ fd = open(tmp, 'w')
+ data_fd = retrieve_uri(imguri)
+ while True:
+ data = data_fd.read(8192)
+ if data == '': break
+ fd.write(data)
+ os.rename (tmp_html, out_html)
+ os.rename (tmp, out)
+
+@cache_result()
+def pictures_of_the_day():
+ heap = [picture_uri]
+ rv = []
+ while heap:
+ uri = heapq.heappop(heap)
+ next_uri = None
+ data = retrieve_uri_data(uri)
+ doc = libxml2.htmlParseDoc(data, 'utf-8')
+ for node in doc.xpathEval('//div[@class="gallerybox"]//a'):
+ rv.append(wikipedia + get_property(node, 'href'))
+ next_uris = doc.xpathEval('//a[text()="next 200"]')
+ if len(next_uris) > 1:
+ next_uri = wikipedia + get_property(next_uris[0], 'href')
+ if next_uri is not None:
+ heapq.heappush(heap, next_uri)
+ return rv
+
+if __name__ == '__main__':
+ pics = pictures_of_the_day()
+ print "We have", len(pics), "pictures of the day!"
+ for pic_uri in pics:
+ try:
+ get_image_and_metadata(pic_uri)
+ except:
+ print traceback.format_exc()
+# print libxml2.htmlParseDoc(open('test2.xml').read(), 'utf-8').xpathEval('//table[tr/th/text()="Description"]')
+# print libxml2.htmlParseDoc(open('test.xml').read(), 'utf-8').xpathEval('//a[text()="next 200"]')