The unified diff between revisions [8c5c8a3d..] and [e331aa42..] is displayed below. It can also be downloaded as a raw diff.

This diff has been restricted to the following files: 'poet3.py'

#
#
# patch "poet3.py"
#  from [d4f25eefe3759a6223471ed27257554aa594a1da]
#    to [010b19fcb764aa693db735656bbc8a3782480a8e]
#
============================================================
--- poet3.py	d4f25eefe3759a6223471ed27257554aa594a1da
+++ poet3.py	010b19fcb764aa693db735656bbc8a3782480a8e
@@ -5,14 +5,18 @@
 # ready for deployment on angrygoats.net
 #

-from generators import RSSGenerator, HTTPGenerator
+from generators import tokenize
 from storage import Storage
 from syllables import Syllables
 from symbolstate import SymbolState
-import os
+
+import feedparser
+import datetime
+import cPickle
 import config
-import cPickle
+import sha
 import web
+import os

 def upwrite(generator, to_upwrite):
     """upwrite a sequence of tokens.
@@ -47,30 +51,99 @@ class Poet3:
             yield token

 class Poet3:
-    def __init__(self, uri):
-        self.uri = uri
-        self.storage = Storage(self.uri)
+    def __init__(self, site, uri):
+        self.site, self.uri = site, uri
+        self.storage = Storage(self.site, self.uri)
         self.syllables = Syllables()
+
         if self.storage.has_file('upwritten.txt'):
             self.upwritten_seqs = cPickle.load(self.storage.open('upwritten.txt'))
         else:
             self.upwritten_seqs = None

+        self.update()
+        self.build_state()
+
+        # if there has been no upwriting yet, then let's do some now.
+        # we save the resulting combined tokens, so the code uses those
+        # when adding new tokens to the corpus from now on
+        if not self.upwritten_seqs:
+            to_upwrite = self.symbol_state.chunkable()
+            cPickle.dump(to_upwrite, self.storage.open('upwritten.txt', 'w'))
+            self.upwrite(to_upwrite)
+
+    def has_data(self):
+        return self.symbol_state.forward_markov.total > 0
+
     def update(self):
+        # to keep traffic down, we'll only check for updates every so often
+        last_update = self.storage.mtime('last_update')
+        if datetime.datetime.now() - last_update < config.update_interval:
+            pass
+#            return
+        self.storage.timestamp('last_update')
+
         self.storage.require_files(('articles.txt', 'corpus.txt'))
+        articles = set((t.strip() for t in self.storage.open('articles.txt')))
+        new_articles = set()

-        articles, new_articles = set(), set()
-        for line in (t.strip() for t in self.storage.open('articles.txt')):
-            articles.add(line)
+        # do we have an e-tag or last-modified?
+        etag = last_modified = None
+        if self.storage.has_file('etag.txt'):
+            etag = self.storage.open('etag.txt').readline().strip()
+        elif self.storage.has_file('last-modified.txt'):
+            try:
+                last_modified = cPickle.load(self.storage.open('last-modified.txt'))
+            except:
+                self.storage.unlink('last-modified.txt')
+
+        fp = feedparser.parse(self.uri, modified=last_modified, etag=etag)
+        if hasattr(fp, "status"):
+            if fp.status == 200:
+                # okay
+                pass
+            elif fp.status == 304:
+                # okay, just not modified since last time; return
+                return
+            elif fp.status == 403:
+                raise Exception("Unable to retrieve RSS feed: permission denied.")
+            elif fp.status == 404:
+                raise Exception("Unable to retrieve RSS feed: not found.")
+            else:
+                raise Exception("Unable to retrieve RSS feed: HTTP error code %d" % (fp.status))
+
+        def fp_generator():
+            for entry in fp['entries']:
+                if hasattr(entry, "guid") and entry.guid:
+                    article = sha.new(entry.guid).hexdigest()
+                    if article in articles:
+                        continue
+                    else:
+                        articles.add(article)
+                        new_articles.add(article)
+                to_tokenize = []
+                if hasattr(entry, "title"):
+                    to_tokenize.append("title")
+                if hasattr(entry, "description"):
+                    to_tokenize.append("description")
+                for attr in to_tokenize:
+                    for token in tokenize(getattr(entry, attr)):
+                        yield token

+        # update last-modified and etag; FIXME what happens if status is 304?
+        if hasattr(fp, "etag") and fp.etag:
+            self.storage.open("etag.txt", "w").write(fp.etag+"\n")
+        else:
+            self.storage.unlink("etag.txt")
+        if hasattr(fp, "modified") and fp.modified:
+            cPickle.dump(fp.modified, self.storage.open("last-modified.txt", "w"))
+        else:
+            self.storage.unlink("last-modified.txt")
+
         corpus_fd = self.storage.open('corpus.txt', 'a')
-        generator = RSSGenerator(HTTPGenerator(self.uri), articles, new_articles)
-        # except that we don't actually *want* token sequences any more; we don't care
-        # and it breaks the upwriting generator.
-        generator = remove_token_sequences(generator)
+        generator = fp_generator()
         if self.upwritten_seqs:
-            generator = remove_token_sequences(upwrite(generator, self.upwritten_seqs))
-
+            generator = upwrite(generator, self.upwritten_seqs)
         for token in generator:
             corpus_fd.write(token+"\n")
         corpus_fd.close()