The unified diff between revisions [8c5c8a3d..] and [e331aa42..] is displayed below. It can also be downloaded as a raw diff.
This diff has been restricted to the following files: 'poet3.py'
#
#
# patch "poet3.py"
# from [d4f25eefe3759a6223471ed27257554aa594a1da]
# to [010b19fcb764aa693db735656bbc8a3782480a8e]
#
============================================================
--- poet3.py d4f25eefe3759a6223471ed27257554aa594a1da
+++ poet3.py 010b19fcb764aa693db735656bbc8a3782480a8e
@@ -5,14 +5,18 @@
# ready for deployment on angrygoats.net
#
-from generators import RSSGenerator, HTTPGenerator
+from generators import tokenize
from storage import Storage
from syllables import Syllables
from symbolstate import SymbolState
-import os
+
+import feedparser
+import datetime
+import cPickle
import config
-import cPickle
+import sha
import web
+import os
def upwrite(generator, to_upwrite):
"""upwrite a sequence of tokens.
@@ -47,30 +51,99 @@ class Poet3:
yield token
class Poet3:
- def __init__(self, uri):
- self.uri = uri
- self.storage = Storage(self.uri)
+ def __init__(self, site, uri):
+ self.site, self.uri = site, uri
+ self.storage = Storage(self.site, self.uri)
self.syllables = Syllables()
+
if self.storage.has_file('upwritten.txt'):
self.upwritten_seqs = cPickle.load(self.storage.open('upwritten.txt'))
else:
self.upwritten_seqs = None
+ self.update()
+ self.build_state()
+
+ # if there has been no upwriting yet, then let's do some now.
+ # we save the resulting combined tokens, so the code uses those
+ # when adding new tokens to the corpus from now on
+ if not self.upwritten_seqs:
+ to_upwrite = self.symbol_state.chunkable()
+ cPickle.dump(to_upwrite, self.storage.open('upwritten.txt', 'w'))
+ self.upwrite(to_upwrite)
+
+ def has_data(self):
+ return self.symbol_state.forward_markov.total > 0
+
def update(self):
+ # to keep traffic down, we'll only check for updates every so often
+ last_update = self.storage.mtime('last_update')
+ if datetime.datetime.now() - last_update < config.update_interval:
+ pass
+# return
+ self.storage.timestamp('last_update')
+
self.storage.require_files(('articles.txt', 'corpus.txt'))
+ articles = set((t.strip() for t in self.storage.open('articles.txt')))
+ new_articles = set()
- articles, new_articles = set(), set()
- for line in (t.strip() for t in self.storage.open('articles.txt')):
- articles.add(line)
+ # do we have an e-tag or last-modified?
+ etag = last_modified = None
+ if self.storage.has_file('etag.txt'):
+ etag = self.storage.open('etag.txt').readline().strip()
+ elif self.storage.has_file('last-modified.txt'):
+ try:
+ last_modified = cPickle.load(self.storage.open('last-modified.txt'))
+ except:
+ self.storage.unlink('last-modified.txt')
+
+ fp = feedparser.parse(self.uri, modified=last_modified, etag=etag)
+ if hasattr(fp, "status"):
+ if fp.status == 200:
+ # okay
+ pass
+ elif fp.status == 304:
+ # okay, just not modified since last time; return
+ return
+ elif fp.status == 403:
+ raise Exception("Unable to retrieve RSS feed: permission denied.")
+ elif fp.status == 404:
+ raise Exception("Unable to retrieve RSS feed: not found.")
+ else:
+ raise Exception("Unable to retrieve RSS feed: HTTP error code %d" % (fp.status))
+
+ def fp_generator():
+ for entry in fp['entries']:
+ if hasattr(entry, "guid") and entry.guid:
+ article = sha.new(entry.guid).hexdigest()
+ if article in articles:
+ continue
+ else:
+ articles.add(article)
+ new_articles.add(article)
+ to_tokenize = []
+ if hasattr(entry, "title"):
+ to_tokenize.append("title")
+ if hasattr(entry, "description"):
+ to_tokenize.append("description")
+ for attr in to_tokenize:
+ for token in tokenize(getattr(entry, attr)):
+ yield token
+ # update last-modified and etag; FIXME what happens if status is 304?
+ if hasattr(fp, "etag") and fp.etag:
+ self.storage.open("etag.txt", "w").write(fp.etag+"\n")
+ else:
+ self.storage.unlink("etag.txt")
+ if hasattr(fp, "modified") and fp.modified:
+ cPickle.dump(fp.modified, self.storage.open("last-modified.txt", "w"))
+ else:
+ self.storage.unlink("last-modified.txt")
+
corpus_fd = self.storage.open('corpus.txt', 'a')
- generator = RSSGenerator(HTTPGenerator(self.uri), articles, new_articles)
- # except that we don't actually *want* token sequences any more; we don't care
- # and it breaks the upwriting generator.
- generator = remove_token_sequences(generator)
+ generator = fp_generator()
if self.upwritten_seqs:
- generator = remove_token_sequences(upwrite(generator, self.upwritten_seqs))
-
+ generator = upwrite(generator, self.upwritten_seqs)
for token in generator:
corpus_fd.write(token+"\n")
corpus_fd.close()