Below is the file 'generators.py' from this revision. You can also download the file.

#!/usr/bin/env python

import re
import web

# Disabled splitter on runs of '.' and '!'; kept for reference.
#s_splitter = re.compile(r'[\.\!]+')

# Word separator: a run of commas, semicolons, double quotes, tabs, hyphens,
# parentheses, asterisks or spaces.
w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')

# Captures one run of ASCII letters, digits and apostrophes.
trimmer = re.compile(r'([A-Za-z0-9\']+)')

# Captures the user name from text that contains a span with class "ljuser"
# followed by an <a href=...> pointing at http://www.livejournal.com/users/NAME.
ljuser = re.compile(r'.*span class=[\'\"]ljuser[\"\'].*<a href=[\"\']http:\/\/www\.livejournal\.com\/users\/([A-Za-z0-9\-\_]+)')

# Upper-cased names of the HTML elements treated as block-level (BR included).
# source: http://www.w3.org/TR/html4/sgml/loosedtd.html#block
block_elements = set([
    "P", "DL", "DIV", "CENTER", "NOSCRIPT", "NOFRAMES", "BLOCKQUOTE",
    "FORM", "ISINDEX", "HR", "TABLE", "FIELDSET", "ADDRESS",
    "H1", "H2", "H3", "H4", "H5", "H6",
    "UL", "OL", "DIR", "MENU", "PRE", "BR",
])

def tokenize(str):
    """Yield the lower-cased words found in *str*, with HTML tags removed.

    The text is handled one line at a time (split on newline characters).
    Each line has its tags stripped, is split on ``w_splitter``, and every
    piece that begins with a run of letters, digits or apostrophes
    (``trimmer``) contributes that run, lower-cased, as one word.  Pieces
    that do not start with such a run are dropped.

    The parameter name shadows the ``str`` builtin; it is kept unchanged so
    existing callers that pass it by keyword keep working.
    """
    def element_name(tag):
        """Return the upper-cased element name of *tag*, or "" if none.

        Angle brackets, slashes and spaces are stripped from both ends so
        that "<br>", "<br/>", "</p>" and '<p class="x">' all resolve to
        their bare element name.
        """
        pieces = tag.strip(" </>").split(None, 1)
        if not pieces:
            return ""
        return pieces[0].upper()

    def remove_html(text):
        """Replace each block-level tag with one space; delete other tags.

        Tag state is local to this call, so a tag left unterminated on a
        line swallows the rest of that line only.
        """
        kept = []        # output fragments, joined once at the end
        tag_chars = []   # characters of the tag currently being read
        in_html = False
        for ch in text:
            if ch == "<":
                in_html = True
            if in_html:
                tag_chars.append(ch)
            else:
                kept.append(ch)
            if ch == ">":
                tag = "".join(tag_chars)
                # fixme, this is too site-specific; let's do something smarter here
                # NOTE(review): the tag buffer is reset at every ">", while
                # ljuser wants a span followed by an <a href=...>, so it only
                # matches when both occur before the first ">" - confirm that
                # this is the intended behaviour.
                user = ljuser.match(tag)
                if user:
                    kept.append(" " + user.group(1))
                # Opening, closing and self-closing block tags all become a
                # space so the words on either side are not glued together.
                if element_name(tag) in block_elements:
                    kept.append(" ")
                in_html = False
                tag_chars = []
        return "".join(kept)

    for line in str.split("\n"):
        for piece in w_splitter.split(remove_html(line)):
            found = trimmer.match(piece)
            if found:
                yield found.group(1).lower()