Below is the file 'generators.py' from this revision. You can also download the file.
#!/usr/bin/env python
"""Generators that fetch an RSS feed over HTTP and emit tokenised sentences.

``HTTPGenerator`` yields raw response-body chunks, ``RSSGenerator`` pushes
those chunks through a libxml2 SAX push parser and yields one sentence (a
list of lower-cased words) at a time, and ``tokenize`` does the text
splitting.
"""

import hashlib
import re
import socket

import libxml2

try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3

# Sentence boundaries: runs of '.' and '!'.
s_splitter = re.compile(r'[\.\!]+')
# Word boundaries: punctuation, tabs, dashes, parentheses, asterisks, spaces.
w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')
# Leading run of letters, digits and apostrophes of a candidate word.
trimmer = re.compile(r'([A-Za-z0-9\']+)')
# LiveJournal user link; group 1 is the user name.
ljuser = re.compile(r'.*span class=[\'\"]ljuser[\"\'].*<a href=[\"\']http:\/\/www\.livejournal\.com\/users\/([A-Za-z0-9\-\_]+)')

_TEXT_TAGS = ("description", "title")

_REQUEST_TEMPLATE = (
    "GET %s HTTP/1.0\r\n"
    "Host: %s\r\n"
    "User-agent: http://grahame.angrygoats.net/lj-haiku/; grahame@angrygoats.net\r\n"
    "X-Goat: yes\r\n"
    "\r\n"
)
_HEADER_END = b"\r\n\r\n"


def _remove_html(text):
    """Return *text* with every ``<...>`` tag replaced by a single space.

    A space is also appended after any ``>`` that appears outside a tag.
    When a tag matches ``ljuser`` the captured user name is emitted (preceded
    by a space) in place of the tag.
    """
    # NOTE(review): current_tag is cleared at every '>', so it never holds
    # more than one tag, while the ljuser pattern wants a span tag followed
    # by an '<a href='.  It looks like this can only match when no '>' sits
    # between the two -- confirm whether it ever fires on real feed markup.
    out = []
    current_tag = []
    in_html = False
    for ch in text:
        if ch == "<":
            in_html = True
        if in_html:
            current_tag.append(ch)
        else:
            out.append(ch)
        if ch == ">":
            user = ljuser.match("".join(current_tag))
            if user:
                out.append(" " + user.group(1))
            in_html = False
            current_tag = []
            out.append(" ")
    return "".join(out)


def tokenize(str):
    """Split *str* into sentences of lower-cased words.

    The text is processed line by line: HTML tags are stripped, each line is
    cut into sentences on ``.``/``!`` and each sentence into words.  A word is
    kept only if it starts with a letter, digit or apostrophe, and only that
    leading run is kept.

    Returns a list with one entry per sentence, each entry being a list of
    words.  Sentences with no usable words appear as empty lists.
    """
    sentences = []
    for line in str.split("\n"):
        for sentence in s_splitter.split(_remove_html(line)):
            matches = [trimmer.match(word)
                       for word in w_splitter.split(sentence)]
            sentences.append([m.group(1).lower() for m in matches if m])
    return sentences


def _sha1_hex(text):
    """Return the SHA-1 hex digest of *text*, encoding it first if needed."""
    if not isinstance(text, bytes):
        text = text.encode("utf-8")
    return hashlib.sha1(text).hexdigest()


class Callback:
    """SAX handler that collects the text of unseen RSS items.

    queue           -- list that tokenised sentences are appended to, in place
    article_ids     -- container of SHA-1 hex digests of already-seen guids
    new_article_ids -- set that digests of newly seen guids are added to
    """

    def __init__(self, queue, article_ids, new_article_ids):
        self.queue = queue
        self.article_ids = article_ids
        self.new_article_ids = new_article_ids
        self._reset()

    def _reset(self):
        """Put the per-document parsing state back to its initial values."""
        self.gather_chars = False
        self.in_item = False
        self.in_guid = False
        self.chars, self.guid = "", ""
        # Digest of the current item's guid; None until a <guid> has closed.
        self.current_guid = None
        # Sentences of the current item, held back until </item>.
        self.pending = []

    def startDocument(self):
        self._reset()

    def characters(self, data):
        if self.gather_chars:
            self.chars += data
        elif self.in_guid:
            self.guid += data

    def endDocument(self):
        pass

    def startElement(self, tag, attrs):
        if tag == "item":
            self.in_item = True
            # Forget the previous item's guid so an item without one cannot
            # be mistaken for its predecessor.
            self.current_guid = None
            self.pending = []
        if tag == "guid":
            self.guid = ""
            self.in_guid = True
        if self.in_item and tag in _TEXT_TAGS:
            self.gather_chars = True

    def endElement(self, tag):
        if tag == "item":
            self.in_item = False
            # Decide here, once the whole item has been seen, so the result
            # does not depend on whether <guid> precedes the text elements.
            # An item without a guid cannot be de-duplicated and is emitted.
            if self.current_guid not in self.article_ids:
                self.queue.extend(self.pending)
                if self.current_guid:
                    self.new_article_ids.add(self.current_guid)
            self.pending = []
        if tag == "guid":
            self.in_guid = False
            self.current_guid = _sha1_hex(self.guid)
        if self.in_item and self.gather_chars and tag in _TEXT_TAGS:
            self.pending.extend(tokenize(self.chars))
            self.chars = ""
            self.gather_chars = False


def _drain(queue):
    """Empty *queue* in place and return its former contents in order."""
    drained = queue[:]
    del queue[:]
    return drained


def RSSGenerator(data_generator, article_ids, new_article_ids):
    """Yield tokenised sentences from the RSS document in *data_generator*.

    data_generator  -- iterable of document chunks, fed to the parser in order
    article_ids     -- digests of guids whose items must be skipped
    new_article_ids -- set that receives the digests of items emitted here

    Sentences are yielded in document order as soon as the chunk that
    completes their item has been parsed.
    """
    queue = []
    callback = Callback(queue, article_ids, new_article_ids)
    ctxt = libxml2.createPushParser(callback, "", 0, "")
    for data in data_generator:
        ctxt.parseChunk(data, len(data), 0)
        for sentence in _drain(queue):
            yield sentence
    # A final, terminating call lets the parser flush what it still holds.
    ctxt.parseChunk("", 0, 1)
    for sentence in _drain(queue):
        yield sentence


def HTTPGenerator(uri):
    """Fetch *uri* with a plain HTTP/1.0 GET and yield the body in chunks.

    The response headers are discarded.  The connection goes to the port
    given in *uri*, or to port 80 when none is given; the scheme is not
    consulted, so only plain HTTP is spoken.  The socket is closed when the
    generator finishes or is closed.
    """
    parsed = urlparse.urlparse(uri)
    netloc = parsed[1]
    host = parsed.hostname or netloc
    port = parsed.port or 80

    request = _REQUEST_TEMPLATE % (uri, netloc)
    if not isinstance(request, bytes):
        request = request.encode("utf-8")

    fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        fd.connect((host, port))
        # sendall: a bare send() may transmit only part of the request.
        fd.sendall(request)

        header_buf = b""
        in_headers = True
        while True:
            data = fd.recv(1024)
            if not data:
                break
            if in_headers:
                # Accumulate until the blank line is seen; the terminator
                # may be split across two recv() calls.
                header_buf += data
                off = header_buf.find(_HEADER_END)
                if off == -1:
                    continue
                in_headers = False
                data = header_buf[off + len(_HEADER_END):]
                header_buf = b""
            if data:
                yield data
    finally:
        fd.close()