# Below is the file 'generators.py' from this revision. You can also download the file.

#!/usr/bin/env python

import hashlib
import re
import sha
import socket

import libxml2

# Sentence boundary: one or more '.' or '!' characters in a row.
s_splitter = re.compile(r'[\.\!]+')
# Word boundary: one or more of comma, semicolon, double quote, tab, hyphen,
# parentheses, asterisk or space.
w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')
# Captures a run of letters, digits and apostrophes. tokenize() applies it
# with .match(), so only a run at the very start of a word is captured.
trimmer = re.compile(r'([A-Za-z0-9\']+)')
# Recognises text containing a span with class "ljuser" followed by a link to
# http://www.livejournal.com/users/<name>, and captures <name> (letters,
# digits, '-' and '_').
ljuser = re.compile(r'.*span class=[\'\"]ljuser[\"\'].*<a href=[\"\']http:\/\/www\.livejournal\.com\/users\/([A-Za-z0-9\-\_]+)')

def tokenize(str):
	"""Split raw text, possibly containing HTML, into sentences of words.

	The text is handled line by line: markup is stripped, each line is cut
	into sentence fragments with s_splitter, and each fragment is cut into
	words with w_splitter. A word is kept only when trimmer matches at its
	start; the captured run is lower-cased.

	Parameters:
	  str -- the text to tokenize. The name shadows the builtin, but it is
	         kept so that keyword callers continue to work.

	Returns a list with one entry per sentence fragment, each entry being a
	list of words. Fragments without any usable word are still present, as
	empty lists.
	"""
	def remove_html(text):
		"replace any HTML with a single character of whitespace"
		# Pieces are collected in lists and joined once; appending to a
		# string one character at a time is quadratic in the worst case.
		kept = []
		tag = []
		in_html = False
		for ch in text:
			if ch == "<":
				in_html = True
			if in_html:
				tag.append(ch)
			else:
				kept.append(ch)
			if ch == ">":
				# A '>' ends the tag collected so far (which is empty when
				# the '>' appeared outside any tag). Keep the user name if
				# the collected text matches the ljuser pattern.
				user = ljuser.match("".join(tag))
				if user:
					kept.append(" " + user.group(1))
				in_html = False
				tag = []
				kept.append(" ")
		# Text following an unterminated '<' is dropped with the tag.
		return "".join(kept)

	sentences = []
	for line in str.split("\n"):
		for sentence in s_splitter.split(remove_html(line)):
			# Real lists are built here: map()/filter() are lazy on Python 3,
			# which would put iterator objects in the result instead of words.
			matches = [trimmer.match(word) for word in w_splitter.split(sentence)]
			sentences.append([m.group(1).lower() for m in matches if m])
	return sentences

class Callback:
	"""SAX-style handler that collects the text of RSS items as sentences.

	Text inside the <title> and <description> elements of an <item> is run
	through tokenize() and the resulting sentences are appended to `queue`,
	unless the SHA-1 hex digest of the item's <guid> is already present in
	`article_ids`. When an item ends, its guid digest is added to
	`article_ids`.

	The duplicate check uses the guid seen so far within the current item,
	so it only takes effect when <guid> comes before the text elements.

	Parameters:
	  queue       -- list that receives the tokenized sentences (extended in
	                 place).
	  article_ids -- collection of guid digests already processed; it must
	                 support `in` and `.add()` (for example a set).
	"""
	def __init__(self, queue, article_ids):
		self.queue = queue
		self.article_ids = article_ids
		# Set up the parsing state right away so the handler is usable even
		# if startDocument() is never delivered.
		self.startDocument()

	def startDocument(self):
		"Reset all per-document parsing state."
		self.gather_chars = False
		self.in_item = False
		self.in_guid = False
		self.chars, self.guid = "", ""
		# Digest of the current item's guid; None until a <guid> is closed.
		self.current_guid = None

	def characters(self, data):
		"Accumulate character data for the element currently being read."
		if self.gather_chars:
			self.chars += data
		elif self.in_guid:
			self.guid += data

	def endDocument(self):
		pass

	def startElement(self, tag, attrs):
		"Track entry into <item>, <guid>, <title> and <description>."
		if tag == "item":
			self.in_item = True
			# Forget the previous item's guid so that an item without its
			# own <guid> is not mistaken for the one before it.
			self.current_guid = None
		if tag == "guid":
			self.guid = ""
			self.in_guid = True
		if self.in_item and tag in ("description", "title"):
			self.gather_chars = True

	def endElement(self, tag):
		"Finish an element: hash guids, queue text, record finished items."
		if tag == "item":
			self.in_item = False
			# Items that carried no <guid> have nothing to record.
			if self.current_guid is not None:
				self.article_ids.add(self.current_guid)
		if tag == "guid":
			self.in_guid = False
			raw_guid = self.guid
			# hashlib needs bytes; text strings are encoded first. On
			# Python 2 a plain str already is bytes and is hashed as-is.
			if not isinstance(raw_guid, bytes):
				raw_guid = raw_guid.encode("utf-8")
			self.current_guid = hashlib.sha1(raw_guid).hexdigest()
		if self.in_item and self.gather_chars and tag in ("description", "title"):
			if self.current_guid not in self.article_ids:
				candidate = tokenize(self.chars)
				if candidate:
					self.queue += candidate
			self.chars = ""
			self.gather_chars = False

def RSSGenerator(data_generator, article_ids=None):
	"""Parse an RSS feed incrementally and yield its sentences.

	Each chunk produced by `data_generator` is fed to a libxml2 push parser
	whose Callback handler queues tokenized sentences; the sentences are
	yielded in the order they were queued, as soon as the chunk that
	completed them has been parsed.

	Parameters:
	  data_generator -- iterable of raw feed chunks.
	  article_ids    -- collection of guid digests to skip; it is updated
	                    with the guid digest of every item seen. It must
	                    support `in` and `.add()`. Defaults to a fresh empty
	                    set for each call.

	Yields one sentence (a list of words) at a time.
	"""
	# A new set per call: Callback calls .add() on this collection, which a
	# list does not provide, and a literal default would be shared by all
	# calls.
	if article_ids is None:
		article_ids = set()
	queue = []
	callback = Callback(queue, article_ids)
	ctxt = libxml2.createPushParser(callback, "", 0, "")

	def drain():
		"Empty the shared queue in place, yielding oldest entries first."
		queue.reverse()
		while queue:
			yield queue.pop()

	for data in data_generator:
		ctxt.parseChunk(data, len(data), 0)
		for sentence in drain():
			yield sentence
	# An empty chunk with the final argument set to 1 ends the parse.
	ctxt.parseChunk("", 0, 1)
	for sentence in drain():
		yield sentence

def HTTPGenerator(host, uri):
	"""Fetch `uri` from `host` over plain HTTP/1.0 and yield the response body.

	Opens a TCP connection to port 80 of `host`, sends a GET request and
	reads the reply in chunks of up to 1024 bytes. Everything up to and
	including the blank line that ends the response headers is discarded;
	the remaining data is yielded chunk by chunk as it arrives.

	Parameters:
	  host -- server name; used for the connection and the Host header.
	  uri  -- request target placed on the GET line.

	Yields non-empty chunks of the response body. The socket is closed when
	the generator finishes, fails, or is closed by the consumer.
	"""
	request = "GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: http://grahame.angrygoats.net/lj-haiku/; grahame@angrygoats.net\r\nX-Goat: yes\r\n\r\n" % (uri, host)
	# Sockets carry bytes. On Python 2 a plain str already is bytes and is
	# sent unchanged; text strings are encoded first.
	if not isinstance(request, bytes):
		request = request.encode("ascii")
	fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	try:
		fd.connect((host, 80))
		# sendall keeps writing until the whole request is out; send() may
		# stop after a partial write.
		fd.sendall(request)
		# Header bytes are buffered until the terminator is found, because
		# the "\r\n\r\n" sequence can be split across two recv() calls.
		header_buf = b""
		in_headers = True
		while 1:
			data = fd.recv(1024)
			if not data:
				break
			if in_headers:
				header_buf += data
				off = header_buf.find(b"\r\n\r\n")
				if off == -1:
					continue
				in_headers = False
				data = header_buf[off + 4:]
				header_buf = b""
			# Nothing to hand on when the headers ended exactly at the end
			# of a chunk.
			if data:
				yield data
	finally:
		fd.close()