The unified diff between revisions [8c5c8a3d..] and [e331aa42..] is displayed below. It can also be downloaded as a raw diff.

This diff has been restricted to the following files: 'generators.py'

#
#
# patch "generators.py"
#  from [1ec26b4d1209e302e47cfb0b1b6f2bd3e857e592]
#    to [95ddf524e1ba9419c070b13870fb6427dbe198ec]
#
============================================================
--- generators.py	1ec26b4d1209e302e47cfb0b1b6f2bd3e857e592
+++ generators.py	95ddf524e1ba9419c070b13870fb6427dbe198ec
@@ -1,119 +1,48 @@ import re
 #!/usr/bin/env python

 import re
-import sha
-import socket
-import libxml2
-import urlparse
+import web

-s_splitter = re.compile(r'[\.\!]+')
+#s_splitter = re.compile(r'[\.\!]+')
 w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')
 trimmer = re.compile(r'([A-Za-z0-9\']+)')
 ljuser = re.compile(r'.*span class=[\'\"]ljuser[\"\'].*<a href=[\"\']http:\/\/www\.livejournal\.com\/users\/([A-Za-z0-9\-\_]+)')
+# source: http://www.w3.org/TR/html4/sgml/loosedtd.html#block
+block_elements = set(["P", "DL", "DIV", "CENTER", "NOSCRIPT", "NOFRAMES", "BLOCKQUOTE", "FORM", "ISINDEX", "HR", "TABLE", "FIELDSET", "ADDRESS", "H1", "H2", "H3", "H4", "H5", "H6", "UL", "OL", "DIR", "MENU", "PRE", "BR"])

 def tokenize(str):
+    def remove_html(str):
+        """replace any HTML with a single character of whitespace (in the case of a block-level element)
+           or the empty string otherwise.
+        """
+        rv = ""
+        current_tag = ""
+        in_html = False
+        for s in str:
+            if s == "<": in_html = True
+            if in_html: current_tag += s
+            else: rv += s
+            if s == ">":
+                element = current_tag.lstrip(' <').split(' ', 1)[0].upper()
+                # fixme, this is too site-specific; let's do something smarter here
+                user = ljuser.match(current_tag)
+                if user: rv += " " + user.groups()[0]
+                if element in block_elements:
+                    rv += " "
+                in_html = False
+                current_tag = ""
+        return rv
+    for line in str.split("\n"):
+#        web.debug("** before html stripping")
+#        web.debug(line)
+        line = remove_html(line)
+#        web.debug("** line")
+#        web.debug(line)
+        words = w_splitter.split(line)
+        words = map(trimmer.match, words)
+        words = filter(None, words)
+        words = map(lambda x: x.groups()[0].lower(), words)
+#        web.debug("** words")
+#        web.debug(words)
+        for word in words:
+            yield word
-	def remove_html(str):
-		"replace any HTML with a single character of whitespace"
-		rv = ""
-		current_tag = ""
-		in_html = False
-		for s in str:
-			if s == "<": in_html = True
-			if in_html: current_tag += s
-			else: rv += s
-			if s == ">":
-				user = ljuser.match(current_tag)
-				if user: rv += " " + user.groups()[0]
-				in_html = False
-				current_tag = ""
-				rv += " "
-		return rv
-	rv = []
-	for line in str.split("\n"):
-#		print "Before:", line
-		line = remove_html(line)
-#		print "Line:", line
-		sentences = s_splitter.split(line)
-		for sentence in sentences:
-			words = w_splitter.split(sentence)
-#			print "sentence is:", sentence
-#			print "words are:", words
-			words = map(trimmer.match, words)
-			words = filter(None, words)
-			words = map(lambda x: x.groups()[0].lower(), words)
-			rv.append(words)
-#		tokens = splitter.split(line)
-#		for token in tokens:
-#			trimmed = trimmer.match(token)
-#			if trimmed: rv.append(trimmed.groups()[0].lower())
-	return rv
-
-class Callback:
-	def __init__(self, queue, article_ids, new_article_ids):
-		self.queue = queue
-		self.article_ids = article_ids
-		self.new_article_ids = new_article_ids
-	def startDocument(self):
-		self.gather_chars = False
-		self.in_item = False
-		self.in_guid = False
-		self.chars, self.guid = "", ""
-	def characters(self, data):
-		if self.gather_chars: self.chars += data
-		elif self.in_guid: self.guid += data
-	def endDocument(self):
-		pass
-	def startElement(self, tag, attrs):
-		if tag == "item":
-			self.in_item = True
-		if tag == "guid":
-			self.guid = ""
-			self.in_guid = True
-		if self.in_item and (tag == "description" or tag == "title"):
-			self.gather_chars = True
-	def endElement(self, tag):
-		if tag == "item":
-			self.in_item = False
-			if self.current_guid and self.current_guid not in self.article_ids:
-				self.new_article_ids.add(self.current_guid)
-		if tag == "guid":
-			self.in_guid = False
-			self.current_guid = sha.new(self.guid).hexdigest()
-		if self.in_item and self.gather_chars and (tag == "description" or tag == "title"):
-			if self.current_guid not in self.article_ids:
-				candidate = tokenize(self.chars)
-				if len(candidate) > 0:
-					self.queue += candidate
-			self.chars = ""
-			self.gather_chars = False
-
-def RSSGenerator(data_generator, article_ids, new_article_ids):
-	queue = []
-	callback = Callback(queue, article_ids, new_article_ids)
-	ctxt = libxml2.createPushParser(callback, "", 0, "")
-	for data in data_generator:
-		ctxt.parseChunk(data, len(data), 0)
-		queue.reverse()
-		while len(queue):
-			yield queue.pop()
-	ctxt.parseChunk("", 0, 1)
-	queue.reverse()
-	while len(queue):
-		yield queue.pop()
-
-def HTTPGenerator(uri):
-	host = urlparse.urlparse(uri)[1]
-	fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-	fd.connect((host, 80))
-	fd.send("GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: http://grahame.angrygoats.net/lj-haiku/; grahame@angrygoats.net\r\nX-Goat: yes\r\n\r\n" % (uri, host))
-	in_headers = True
-	while 1:
-		data = fd.recv(1024)
-		if data == "": break
-		if in_headers:
-			off = data.find("\r\n\r\n")
-			if off <> -1:
-				in_headers = False
-				data = data[off+4:]
-		if not in_headers:
-			yield data