Below is the file 'generators.py' from this revision. You can also download the file.
#!/usr/bin/env python
"""Tokenizer that turns (possibly HTML-laden) text into lowercase words."""

import re

try:
    import web  # noqa: F401 -- web.py; only useful for ad-hoc web.debug() tracing
except ImportError:
    # Nothing below calls into web.py, so the tokenizer stays importable
    # on machines where it is not installed.
    web = None

# Sentence splitter, currently unused:
#s_splitter = re.compile(r'[\.\!]+')

# Runs of these characters separate candidate words.
w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')

# A word is the leading run of letters, digits and apostrophes of a candidate;
# candidates that do not start with such a character are discarded.
trimmer = re.compile(r'([A-Za-z0-9\']+)')

# Captures the user name from LiveJournal "ljuser" markup.
ljuser = re.compile(r'.*span class=[\'\"]ljuser[\"\'].*<a href=[\"\']http:\/\/www\.livejournal\.com\/users\/([A-Za-z0-9\-\_]+)')

# source: http://www.w3.org/TR/html4/sgml/loosedtd.html#block
block_elements = set(["P", "DL", "DIV", "CENTER", "NOSCRIPT", "NOFRAMES",
                      "BLOCKQUOTE", "FORM", "ISINDEX", "HR", "TABLE",
                      "FIELDSET", "ADDRESS", "H1", "H2", "H3", "H4", "H5",
                      "H6", "UL", "OL", "DIR", "MENU", "PRE", "BR"])

# Element name of a raw tag: skips the opening "<", optional whitespace and an
# optional "/" (closing tag); the name must be followed by whitespace, "/" or
# ">" so that e.g. "<p-custom>" is not mistaken for "<p>".
_tag_name_re = re.compile(r'[<\s]*/?\s*([A-Za-z][A-Za-z0-9]*)(?=[\s/>])')


def _tag_name(tag):
    """Return the upper-cased element name of the raw tag text, or ""."""
    found = _tag_name_re.match(tag)
    if found is None:
        return ""
    return found.group(1).upper()


def _remove_html(text):
    """Return text with every "<...>" tag removed.

    A tag whose element is listed in block_elements (opening, closing or
    self-closing) is replaced by a single space so that the words on either
    side of it stay separate; any other tag is replaced by the empty string.
    Text following an unterminated "<" is dropped.
    """
    kept = []        # output fragments, joined once at the end
    tag_chars = []   # characters of the tag currently being read
    in_html = False
    for ch in text:
        if ch == "<":
            in_html = True
        if in_html:
            tag_chars.append(ch)
        else:
            kept.append(ch)
        if ch != ">":
            continue
        # End of a tag (or a stray ">" in plain text, where the tag is "").
        current_tag = "".join(tag_chars)
        # fixme, this is too site-specific; let's do something smarter here
        user = ljuser.match(current_tag)
        if user:
            kept.append(" " + user.group(1))
        if _tag_name(current_tag) in block_elements:
            kept.append(" ")
        in_html = False
        tag_chars = []
    return "".join(kept)


def tokenize(str):
    """Yield the lowercase words of the given text, one at a time.

    The parameter keeps its historical name (which shadows the builtin) so
    that existing keyword callers continue to work.

    The text is processed line by line: HTML tags are stripped, the rest is
    split on w_splitter, and each piece is reduced to its leading run of
    letters, digits and apostrophes. Pieces without such a prefix are
    skipped. Because tag state is reset on every line, a tag that spans a
    line break is not recognised as a tag.
    """
    for line in str.split("\n"):
        for piece in w_splitter.split(_remove_html(line)):
            found = trimmer.match(piece)
            if found:
                yield found.group(1).lower()