The unified diff between revisions [8c5c8a3d..] and [e331aa42..] is displayed below. It can also be downloaded as a raw diff.
#
#
# delete "poet.py"
#
# patch "config.py"
# from [2532e16eeb23781130fb73fc44f0ea8c78037dff]
# to [3b32f26bb0a045f902ded27d5c8049eeb41e0b1c]
#
# patch "generators.py"
# from [1ec26b4d1209e302e47cfb0b1b6f2bd3e857e592]
# to [95ddf524e1ba9419c070b13870fb6427dbe198ec]
#
# patch "poet3.py"
# from [d4f25eefe3759a6223471ed27257554aa594a1da]
# to [010b19fcb764aa693db735656bbc8a3782480a8e]
#
# patch "poetweb.py"
# from [b7f62177c811a28243a96a0921525f950120e23c]
# to [4ead8fdbbb9b582210fe226b0fc04022284712b2]
#
# patch "storage.py"
# from [15df0aaa6f37d41868a4e7b75beb834d7caea7e2]
# to [42dc917e0234c29279bfcbc1c6756f5cfc965446]
#
# patch "templates/base.html"
# from [8f3545d98812a0f20d84551a7471b15a2c8152d4]
# to [a9406028ffbbd2e3ec5419e396064ebdf0dc11be]
#
# patch "templates/haiku.html"
# from [e18c4556feb6de7c6c4a7a7e24dcae959075f3a0]
# to [b030c371938ec940a55358dce11ab28b80d10392]
#
============================================================
--- config.py 2532e16eeb23781130fb73fc44f0ea8c78037dff
+++ config.py 3b32f26bb0a045f902ded27d5c8049eeb41e0b1c
@@ -2,7 +2,7 @@ import os
import datetime
import os
-install_path = '/home/grahame/monotone/memes/'
+install_path = '/Users/grahame/monotone/memes/'
storage_path = os.path.join(install_path, 'storage')
word_db = os.path.join(install_path, 'word.db')
============================================================
--- generators.py 1ec26b4d1209e302e47cfb0b1b6f2bd3e857e592
+++ generators.py 95ddf524e1ba9419c070b13870fb6427dbe198ec
@@ -1,119 +1,48 @@ import re
#!/usr/bin/env python
import re
-import sha
-import socket
-import libxml2
-import urlparse
+import web
-s_splitter = re.compile(r'[\.\!]+')
+#s_splitter = re.compile(r'[\.\!]+')
w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')
trimmer = re.compile(r'([A-Za-z0-9\']+)')
ljuser = re.compile(r'.*span class=[\'\"]ljuser[\"\'].*<a href=[\"\']http:\/\/www\.livejournal\.com\/users\/([A-Za-z0-9\-\_]+)')
+# source: http://www.w3.org/TR/html4/sgml/loosedtd.html#block
+block_elements = set(["P", "DL", "DIV", "CENTER", "NOSCRIPT", "NOFRAMES", "BLOCKQUOTE", "FORM", "ISINDEX", "HR", "TABLE", "FIELDSET", "ADDRESS", "H1", "H2", "H3", "H4", "H5", "H6", "UL", "OL", "DIR", "MENU", "PRE", "BR"])
def tokenize(str):
+ def remove_html(str):
+ """replace any HTML with a single character of whitespace (in the case of a block-level element)
+ or the empty string otherwise.
+ """
+ rv = ""
+ current_tag = ""
+ in_html = False
+ for s in str:
+ if s == "<": in_html = True
+ if in_html: current_tag += s
+ else: rv += s
+ if s == ">":
+ element = current_tag.lstrip(' <').split(' ', 1)[0].upper()
+ # fixme, this is too site-specific; let's do something smarter here
+ user = ljuser.match(current_tag)
+ if user: rv += " " + user.groups()[0]
+ if element in block_elements:
+ rv += " "
+ in_html = False
+ current_tag = ""
+ return rv
+ for line in str.split("\n"):
+# web.debug("** before html stripping")
+# web.debug(line)
+ line = remove_html(line)
+# web.debug("** line")
+# web.debug(line)
+ words = w_splitter.split(line)
+ words = map(trimmer.match, words)
+ words = filter(None, words)
+ words = map(lambda x: x.groups()[0].lower(), words)
+# web.debug("** words")
+# web.debug(words)
+ for word in words:
+ yield word
- def remove_html(str):
- "replace any HTML with a single character of whitespace"
- rv = ""
- current_tag = ""
- in_html = False
- for s in str:
- if s == "<": in_html = True
- if in_html: current_tag += s
- else: rv += s
- if s == ">":
- user = ljuser.match(current_tag)
- if user: rv += " " + user.groups()[0]
- in_html = False
- current_tag = ""
- rv += " "
- return rv
- rv = []
- for line in str.split("\n"):
-# print "Before:", line
- line = remove_html(line)
-# print "Line:", line
- sentences = s_splitter.split(line)
- for sentence in sentences:
- words = w_splitter.split(sentence)
-# print "sentence is:", sentence
-# print "words are:", words
- words = map(trimmer.match, words)
- words = filter(None, words)
- words = map(lambda x: x.groups()[0].lower(), words)
- rv.append(words)
-# tokens = splitter.split(line)
-# for token in tokens:
-# trimmed = trimmer.match(token)
-# if trimmed: rv.append(trimmed.groups()[0].lower())
- return rv
-
-class Callback:
- def __init__(self, queue, article_ids, new_article_ids):
- self.queue = queue
- self.article_ids = article_ids
- self.new_article_ids = new_article_ids
- def startDocument(self):
- self.gather_chars = False
- self.in_item = False
- self.in_guid = False
- self.chars, self.guid = "", ""
- def characters(self, data):
- if self.gather_chars: self.chars += data
- elif self.in_guid: self.guid += data
- def endDocument(self):
- pass
- def startElement(self, tag, attrs):
- if tag == "item":
- self.in_item = True
- if tag == "guid":
- self.guid = ""
- self.in_guid = True
- if self.in_item and (tag == "description" or tag == "title"):
- self.gather_chars = True
- def endElement(self, tag):
- if tag == "item":
- self.in_item = False
- if self.current_guid and self.current_guid not in self.article_ids:
- self.new_article_ids.add(self.current_guid)
- if tag == "guid":
- self.in_guid = False
- self.current_guid = sha.new(self.guid).hexdigest()
- if self.in_item and self.gather_chars and (tag == "description" or tag == "title"):
- if self.current_guid not in self.article_ids:
- candidate = tokenize(self.chars)
- if len(candidate) > 0:
- self.queue += candidate
- self.chars = ""
- self.gather_chars = False
-
-def RSSGenerator(data_generator, article_ids, new_article_ids):
- queue = []
- callback = Callback(queue, article_ids, new_article_ids)
- ctxt = libxml2.createPushParser(callback, "", 0, "")
- for data in data_generator:
- ctxt.parseChunk(data, len(data), 0)
- queue.reverse()
- while len(queue):
- yield queue.pop()
- ctxt.parseChunk("", 0, 1)
- queue.reverse()
- while len(queue):
- yield queue.pop()
-
-def HTTPGenerator(uri):
- host = urlparse.urlparse(uri)[1]
- fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- fd.connect((host, 80))
- fd.send("GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: http://grahame.angrygoats.net/lj-haiku/; grahame@angrygoats.net\r\nX-Goat: yes\r\n\r\n" % (uri, host))
- in_headers = True
- while 1:
- data = fd.recv(1024)
- if data == "": break
- if in_headers:
- off = data.find("\r\n\r\n")
- if off <> -1:
- in_headers = False
- data = data[off+4:]
- if not in_headers:
- yield data
============================================================
--- poet3.py d4f25eefe3759a6223471ed27257554aa594a1da
+++ poet3.py 010b19fcb764aa693db735656bbc8a3782480a8e
@@ -5,14 +5,18 @@
# ready for deployment on angrygoats.net
#
-from generators import RSSGenerator, HTTPGenerator
+from generators import tokenize
from storage import Storage
from syllables import Syllables
from symbolstate import SymbolState
-import os
+
+import feedparser
+import datetime
+import cPickle
import config
-import cPickle
+import sha
import web
+import os
def upwrite(generator, to_upwrite):
"""upwrite a sequence of tokens.
@@ -47,30 +51,99 @@ class Poet3:
yield token
class Poet3:
- def __init__(self, uri):
- self.uri = uri
- self.storage = Storage(self.uri)
+ def __init__(self, site, uri):
+ self.site, self.uri = site, uri
+ self.storage = Storage(self.site, self.uri)
self.syllables = Syllables()
+
if self.storage.has_file('upwritten.txt'):
self.upwritten_seqs = cPickle.load(self.storage.open('upwritten.txt'))
else:
self.upwritten_seqs = None
+ self.update()
+ self.build_state()
+
+ # if there has been no upwriting yet, then let's do some now.
+ # we save the resulting combined tokens, so the code uses those
+ # when adding new tokens to the corpus from now on
+ if not self.upwritten_seqs:
+ to_upwrite = self.symbol_state.chunkable()
+ cPickle.dump(to_upwrite, self.storage.open('upwritten.txt', 'w'))
+ self.upwrite(to_upwrite)
+
+ def has_data(self):
+ return self.symbol_state.forward_markov.total > 0
+
def update(self):
+ # to keep traffic down, we'll only check for updates every so often
+ last_update = self.storage.mtime('last_update')
+ if datetime.datetime.now() - last_update < config.update_interval:
+ pass
+# return
+ self.storage.timestamp('last_update')
+
self.storage.require_files(('articles.txt', 'corpus.txt'))
+ articles = set((t.strip() for t in self.storage.open('articles.txt')))
+ new_articles = set()
- articles, new_articles = set(), set()
- for line in (t.strip() for t in self.storage.open('articles.txt')):
- articles.add(line)
+ # do we have an e-tag or last-modified?
+ etag = last_modified = None
+ if self.storage.has_file('etag.txt'):
+ etag = self.storage.open('etag.txt').readline().strip()
+ elif self.storage.has_file('last-modified.txt'):
+ try:
+ last_modified = cPickle.load(self.storage.open('last-modified.txt'))
+ except:
+ self.storage.unlink('last-modified.txt')
+
+ fp = feedparser.parse(self.uri, modified=last_modified, etag=etag)
+ if hasattr(fp, "status"):
+ if fp.status == 200:
+ # okay
+ pass
+ elif fp.status == 304:
+ # okay, just not modified since last time; return
+ return
+ elif fp.status == 403:
+ raise Exception("Unable to retrieve RSS feed: permission denied.")
+ elif fp.status == 404:
+ raise Exception("Unable to retrieve RSS feed: not found.")
+ else:
+ raise Exception("Unable to retrieve RSS feed: HTTP error code %d" % (fp.status))
+
+ def fp_generator():
+ for entry in fp['entries']:
+ if hasattr(entry, "guid") and entry.guid:
+ article = sha.new(entry.guid).hexdigest()
+ if article in articles:
+ continue
+ else:
+ articles.add(article)
+ new_articles.add(article)
+ to_tokenize = []
+ if hasattr(entry, "title"):
+ to_tokenize.append("title")
+ if hasattr(entry, "description"):
+ to_tokenize.append("description")
+ for attr in to_tokenize:
+ for token in tokenize(getattr(entry, attr)):
+ yield token
+ # update last-modified and etag; FIXME what happens if status is 304?
+ if hasattr(fp, "etag") and fp.etag:
+ self.storage.open("etag.txt", "w").write(fp.etag+"\n")
+ else:
+ self.storage.unlink("etag.txt")
+ if hasattr(fp, "modified") and fp.modified:
+ cPickle.dump(fp.modified, self.storage.open("last-modified.txt", "w"))
+ else:
+ self.storage.unlink("last-modified.txt")
+
corpus_fd = self.storage.open('corpus.txt', 'a')
- generator = RSSGenerator(HTTPGenerator(self.uri), articles, new_articles)
- # except that we don't actually *want* token sequences any more; we don't care
- # and it breaks the upwriting generator.
- generator = remove_token_sequences(generator)
+ generator = fp_generator()
if self.upwritten_seqs:
- generator = remove_token_sequences(upwrite(generator, self.upwritten_seqs))
-
+ generator = upwrite(generator, self.upwritten_seqs)
for token in generator:
corpus_fd.write(token+"\n")
corpus_fd.close()
============================================================
--- poetweb.py b7f62177c811a28243a96a0921525f950120e23c
+++ poetweb.py 4ead8fdbbb9b582210fe226b0fc04022284712b2
@@ -6,14 +6,13 @@
# All rights reserved.
#
-import datetime
-import urlparse
-import web
from poet3 import Poet3
from forms import haiku
-import cPickle
+
+import urlparse
import config
import syslog
+import web
import re
# for debugging, logging of referrers, etc.
@@ -32,6 +31,7 @@ class NewLivejournal:
return 'http://users.%s/%s/' % (site, username)
else:
return 'http://%s.%s/' % (username, site)
+
@classmethod
def rss_uri(self, site, username):
return NewLivejournal.user_uri(site, username) + 'data/rss'
@@ -39,6 +39,7 @@ rss_lookup = {
rss_lookup = {
'livejournal.com' : NewLivejournal,
'deadjournal.com' : NewLivejournal,
+ 'greatestjournal.com' : NewLivejournal,
}
urls = (
@@ -49,32 +50,6 @@ urls = (
'/robots.txt', 'RobotsView',
)
-def gen_haiku(rss_uri):
- poet = Poet3(rss_uri)
- last_update = poet.storage.mtime('last_update')
- if datetime.datetime.now() - last_update > config.update_interval:
- poet.update()
- # FIXME this doesn't work..
- poet.storage.open('last_update', 'w').write('')
- poet.build_state()
- if poet.symbol_state.forward_markov.total == 0:
- return -1, None
-
- # if there has been no upwriting yet, then let's do some now.
- # we save the resulting combined tokens, so the code uses those
- # when adding new tokens to the corpus from now on
- if not poet.upwritten_seqs:
- to_upwrite = poet.symbol_state.chunkable()
- cPickle.dump(to_upwrite, poet.storage.open('upwritten.txt', 'w'))
- poet.upwrite(to_upwrite)
-
- for attempt in xrange(config.haiku_attempts):
- poem = haiku(poet)
- if poem != None:
- break
-
- return attempt, poem
-
class Renderer:
def __init__(self):
# any templates that can be inherited from, should be added to the list here
@@ -107,14 +82,25 @@ class HaikuView:
haiku_base = '''<form action="%(dynamic_uri_path)spost/haiku" method="post"><table align="center" border="0" bgcolor="#DDDDFF" style="border: 1px solid black;"><tr><th>LJ-Haiku<sup>2</sup> for %(username)s</th></tr><tr><td><blockquote align="right" style="text-align:right;border-right:1px solid #808080; padding:5px;"> %(haiku)s</blockquote></td></tr><tr><td align="center"><input type="text" size=8 name="haiku_username" value="%(username)s" /> @ <select name="haiku_server"><option selected value="%(mysite)s">%(mysite)s</option><option value="x">Other...</option></select><input value="%(username)s" type="hidden" name="haiku_referrer" /></td></tr><tr><td align="center"><input type="submit" value="What's my Haiku?"></td></tr><tr><td align="center" bgcolor="#CCCCCC"><small><a href="http://www.livejournal.com/users/grahame/">Created by Grahame</a></small></td></tr></table></form>'''
class HaikuView:
+ def gen_haiku(self, site, rss_uri):
+ poet = Poet3(site, rss_uri)
+ if not poet.has_data():
+ return -1, None
+
+ for attempt in xrange(config.haiku_attempts):
+ poem = haiku(poet)
+ if poem != None:
+ break
+
+ return attempt, poem
+
def GET(self, site, username):
if not rss_lookup.has_key(site):
## fixme; this is a site we don't know about; let's return a friendly
## page suggesting the user might want to ask about adding it
return web.notfound()
- uri = rss_lookup[site].rss_uri(site, username)
- web.debug(uri)
- attempts, haiku = gen_haiku(uri)
+ rss_uri = rss_lookup[site].rss_uri(site, username)
+ attempts, haiku = self.gen_haiku(site, rss_uri)
if haiku == None:
haiku_code = None
else:
@@ -122,7 +108,6 @@ class HaikuView:
'dynamic_uri_path' : config.dynamic_uri_path,
'username' : username,
'haiku' : '<br />'.join(' '.join(t) for t in haiku) }
- web.debug("haiku_code=%s" % haiku_code)
renderer.render("haiku.html",
attempts=attempts,
page_title="your haiku",
@@ -132,6 +117,7 @@ class PostMemeView:
class PostMemeView:
def GET(self, meme_name):
web.seeother(config.dynamic_uri_path)
+
def POST(self, meme_name):
meme_name = meme_name.lower()
i = web.input()
============================================================
--- storage.py 15df0aaa6f37d41868a4e7b75beb834d7caea7e2
+++ storage.py 42dc917e0234c29279bfcbc1c6756f5cfc965446
@@ -12,14 +12,14 @@ class Storage:
pass
class Storage:
- def __init__(self, uri):
- self.uri = uri
+ def __init__(self, site, uri):
+ self.site, self.uri = site, uri
self.dir = self.__storage_dir()
def __storage_dir(self):
# list of directories in the storage path to get to this URI
dirs = []
- site = filter(lambda x: x in string.letters or x in string.digits or x == '.', urlparse.urlparse(self.uri)[1])
+ site = filter(lambda x: x in string.letters or x in string.digits or x == '.', self.site)
# no .. entries to climb the filesystem :-)
site = site.lstrip('.')
dirs.append(site)
@@ -48,6 +48,18 @@ class Storage:
timestamp = os.stat(fname)[stat.ST_MTIME]
return datetime.datetime.fromtimestamp(timestamp)
+ def timestamp(self, fname):
+ fname = self.file(fname)
+ if not os.access(fname, os.R_OK):
+ open(fname, 'w')
+ os.utime(fname, None)
+
+ def unlink(self, fname):
+ try:
+ os.unlink(self.file(fname))
+ except OSError:
+ pass
+
def has_file(self, fname):
return os.access(self.file(fname), os.R_OK)
============================================================
--- templates/base.html 8f3545d98812a0f20d84551a7471b15a2c8152d4
+++ templates/base.html a9406028ffbbd2e3ec5419e396064ebdf0dc11be
@@ -22,9 +22,7 @@
</div>
<h1 id="pageTitle">memes.angrygoats.net</h1>
-
#block body
#end block
-
</body>
</html>
============================================================
--- templates/haiku.html e18c4556feb6de7c6c4a7a7e24dcae959075f3a0
+++ templates/haiku.html b030c371938ec940a55358dce11ab28b80d10392
@@ -18,7 +18,8 @@ $haiku_code
#filter WebSafe
<p align="center">
- Want to add this to your journal? Just copy the following code into your journal.
+ Want to add this to your journal? Just copy the following code into your journal.<br />
+ Want another? Just <a href="javascript:window.location.reload();">reload</a>.
</p>
<p style='border-style: solid; width: 50%; position: relative; left: 25%; padding: 3px; border-width: 2; border-color: green;'>
<small><small><code>