The unified diff between revisions [8c5c8a3d..] and [e331aa42..] is displayed below. It can also be downloaded as a raw diff.
This diff has been restricted to the following files: 'generators.py'
#
#
# patch "generators.py"
# from [1ec26b4d1209e302e47cfb0b1b6f2bd3e857e592]
# to [95ddf524e1ba9419c070b13870fb6427dbe198ec]
#
============================================================
--- generators.py 1ec26b4d1209e302e47cfb0b1b6f2bd3e857e592
+++ generators.py 95ddf524e1ba9419c070b13870fb6427dbe198ec
@@ -1,119 +1,48 @@ import re
#!/usr/bin/env python
import re
-import sha
-import socket
-import libxml2
-import urlparse
+import web
-s_splitter = re.compile(r'[\.\!]+')
+#s_splitter = re.compile(r'[\.\!]+')
w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')
trimmer = re.compile(r'([A-Za-z0-9\']+)')
ljuser = re.compile(r'.*span class=[\'\"]ljuser[\"\'].*<a href=[\"\']http:\/\/www\.livejournal\.com\/users\/([A-Za-z0-9\-\_]+)')
+# source: http://www.w3.org/TR/html4/sgml/loosedtd.html#block
+block_elements = set(["P", "DL", "DIV", "CENTER", "NOSCRIPT", "NOFRAMES", "BLOCKQUOTE", "FORM", "ISINDEX", "HR", "TABLE", "FIELDSET", "ADDRESS", "H1", "H2", "H3", "H4", "H5", "H6", "UL", "OL", "DIR", "MENU", "PRE", "BR"])
def tokenize(str):
+ def remove_html(str):
+ """replace any HTML with a single character of whitespace (in the case of a block-level element)
+ or the empty string otherwise.
+ """
+ rv = ""
+ current_tag = ""
+ in_html = False
+ for s in str:
+ if s == "<": in_html = True
+ if in_html: current_tag += s
+ else: rv += s
+ if s == ">":
+ element = current_tag.lstrip(' <').split(' ', 1)[0].upper()
+ # fixme, this is too site-specific; let's do something smarter here
+ user = ljuser.match(current_tag)
+ if user: rv += " " + user.groups()[0]
+ if element in block_elements:
+ rv += " "
+ in_html = False
+ current_tag = ""
+ return rv
+ for line in str.split("\n"):
+# web.debug("** before html stripping")
+# web.debug(line)
+ line = remove_html(line)
+# web.debug("** line")
+# web.debug(line)
+ words = w_splitter.split(line)
+ words = map(trimmer.match, words)
+ words = filter(None, words)
+ words = map(lambda x: x.groups()[0].lower(), words)
+# web.debug("** words")
+# web.debug(words)
+ for word in words:
+ yield word
- def remove_html(str):
- "replace any HTML with a single character of whitespace"
- rv = ""
- current_tag = ""
- in_html = False
- for s in str:
- if s == "<": in_html = True
- if in_html: current_tag += s
- else: rv += s
- if s == ">":
- user = ljuser.match(current_tag)
- if user: rv += " " + user.groups()[0]
- in_html = False
- current_tag = ""
- rv += " "
- return rv
- rv = []
- for line in str.split("\n"):
-# print "Before:", line
- line = remove_html(line)
-# print "Line:", line
- sentences = s_splitter.split(line)
- for sentence in sentences:
- words = w_splitter.split(sentence)
-# print "sentence is:", sentence
-# print "words are:", words
- words = map(trimmer.match, words)
- words = filter(None, words)
- words = map(lambda x: x.groups()[0].lower(), words)
- rv.append(words)
-# tokens = splitter.split(line)
-# for token in tokens:
-# trimmed = trimmer.match(token)
-# if trimmed: rv.append(trimmed.groups()[0].lower())
- return rv
-
-class Callback:
- def __init__(self, queue, article_ids, new_article_ids):
- self.queue = queue
- self.article_ids = article_ids
- self.new_article_ids = new_article_ids
- def startDocument(self):
- self.gather_chars = False
- self.in_item = False
- self.in_guid = False
- self.chars, self.guid = "", ""
- def characters(self, data):
- if self.gather_chars: self.chars += data
- elif self.in_guid: self.guid += data
- def endDocument(self):
- pass
- def startElement(self, tag, attrs):
- if tag == "item":
- self.in_item = True
- if tag == "guid":
- self.guid = ""
- self.in_guid = True
- if self.in_item and (tag == "description" or tag == "title"):
- self.gather_chars = True
- def endElement(self, tag):
- if tag == "item":
- self.in_item = False
- if self.current_guid and self.current_guid not in self.article_ids:
- self.new_article_ids.add(self.current_guid)
- if tag == "guid":
- self.in_guid = False
- self.current_guid = sha.new(self.guid).hexdigest()
- if self.in_item and self.gather_chars and (tag == "description" or tag == "title"):
- if self.current_guid not in self.article_ids:
- candidate = tokenize(self.chars)
- if len(candidate) > 0:
- self.queue += candidate
- self.chars = ""
- self.gather_chars = False
-
-def RSSGenerator(data_generator, article_ids, new_article_ids):
- queue = []
- callback = Callback(queue, article_ids, new_article_ids)
- ctxt = libxml2.createPushParser(callback, "", 0, "")
- for data in data_generator:
- ctxt.parseChunk(data, len(data), 0)
- queue.reverse()
- while len(queue):
- yield queue.pop()
- ctxt.parseChunk("", 0, 1)
- queue.reverse()
- while len(queue):
- yield queue.pop()
-
-def HTTPGenerator(uri):
- host = urlparse.urlparse(uri)[1]
- fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- fd.connect((host, 80))
- fd.send("GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: http://grahame.angrygoats.net/lj-haiku/; grahame@angrygoats.net\r\nX-Goat: yes\r\n\r\n" % (uri, host))
- in_headers = True
- while 1:
- data = fd.recv(1024)
- if data == "": break
- if in_headers:
- off = data.find("\r\n\r\n")
- if off <> -1:
- in_headers = False
- data = data[off+4:]
- if not in_headers:
- yield data