Below is the file 'poet3.py' from this revision. You can also download the file.

#!/usr/bin/env python

#
# take poet.py and make it fast / sensible,
# ready for deployment on angrygoats.net
#

from generators import tokenize
from storage import Storage
from syllables import Syllables
from symbolstate import SymbolState

import feedparser
import datetime
import cPickle
import config
import sha
import web
import os

def upwrite(generator, to_upwrite):
    """Replace known token sequences in a token stream with combined tokens.

    Each time one of the sequences in to_upwrite occurs in the stream, the
    tokens that make it up are replaced by a single token: the sequence's
    tokens joined with single spaces.  All other tokens pass through
    unchanged (after being stripped of surrounding whitespace).

    generator  -- iterable of string tokens.
    to_upwrite -- list of sequences to be upwritten.
                  assumption: all sequences are non-empty and equal length.
                  If the list is empty, tokens are passed through as-is.

    This is a generator.  It yields *lists* of output tokens (a one-item
    list for a combined token, or a run of ordinary tokens), so callers
    that want a flat token stream must flatten the result.
    """
    if not to_upwrite:
        # nothing to combine: behave as a plain pass-through
        for token in generator:
            yield [token.strip()]
        return

    seq_size = len(to_upwrite[0])
    # matches[i] is how many leading tokens of to_upwrite[i] are matched by
    # the most recent tokens of the stream
    matches = [0] * len(to_upwrite)
    # tokens read but not yet emitted, because they may still turn out to be
    # the start of a sequence
    pending = []
    for token in (t.strip() for t in generator):
        pending.append(token)
        for i, seq in enumerate(to_upwrite):
            if token == seq[matches[i]]:
                matches[i] += 1
            else:
                # the partial match broke; rather than restarting from
                # zero, fall back to the longest tail of the recent tokens
                # that is still a prefix of this sequence, so overlapping
                # starts such as "a a b" against ("a", "b") are not missed
                fallback = 0
                for length in range(matches[i], 0, -1):
                    if pending[-length:] == list(seq[:length]):
                        fallback = length
                        break
                matches[i] = fallback

            if matches[i] == seq_size:
                # pending now holds exactly the matched sequence (everything
                # before it was flushed on earlier iterations)
                yield [' '.join(to_upwrite[i])]
                matches = [0] * len(to_upwrite)
                pending = []
                break
        # emit everything that can no longer be part of any partial match
        longest_match = max(matches)
        if len(pending) > longest_match:
            output_idx = len(pending) - longest_match
            output, pending = pending[:output_idx], pending[output_idx:]
            yield output

    # the stream ended in the middle of a partial match; those tokens are
    # ordinary tokens after all and must not be dropped
    if pending:
        yield pending

def remove_token_sequences(generator):
    """Flatten an iterable of token sequences into a stream of tokens.

    Yields every item of every sequence produced by generator, in order.
    """
    flattened = (item for sequence in generator for item in sequence)
    for item in flattened:
        yield item

class Poet3:
    """Maintains the token corpus and symbol state for one feed.

    On construction the feed at uri is fetched, any new entries are
    tokenized and appended to the stored corpus, and a SymbolState is
    built from the whole corpus.  The first time round (no upwritten
    sequences stored yet) the corpus is also rewritten with the sequences
    suggested by the symbol state combined into single tokens.

    Files used through self.storage: 'articles.txt', 'corpus.txt',
    'upwritten.txt', 'upwrite.txt', 'etag.txt', 'last-modified.txt' and
    the 'last_update' timestamp.
    """
    def __init__(self, site, uri):
        """Load stored state for (site, uri), refresh it, and build the model.

        May raise whatever update() raises when the feed cannot be fetched.
        """
        self.site, self.uri = site, uri
        self.storage = Storage(self.site, self.uri)
        self.syllables = Syllables()

        if self.storage.has_file('upwritten.txt'):
            upwritten_fd = self.storage.open('upwritten.txt')
            try:
                # NOTE: pickle data is only safe to load because this file
                # is written by this class itself (see below)
                self.upwritten_seqs = cPickle.load(upwritten_fd)
            finally:
                upwritten_fd.close()
        else:
            self.upwritten_seqs = None

        self.update()
        self.build_state()

        # if there has been no upwriting yet, then let's do some now.
        # we save the resulting combined tokens, so the code uses those
        # when adding new tokens to the corpus from now on
        if not self.upwritten_seqs:
            to_upwrite = self.symbol_state.chunkable()
            upwritten_fd = self.storage.open('upwritten.txt', 'w')
            try:
                cPickle.dump(to_upwrite, upwritten_fd)
            finally:
                upwritten_fd.close()
            self.upwrite(to_upwrite)
            # remember the sequences in memory as well, so a later update()
            # on this same instance appends tokens in the upwritten form
            # the corpus now has
            self.upwritten_seqs = to_upwrite
            # NOTE(review): self.symbol_state was built from the corpus as
            # it was before the rewrite above -- confirm whether it should
            # be rebuilt here.

    def has_data(self):
        """Return True if the forward markov model has seen any tokens."""
        return self.symbol_state.forward_markov.total > 0

    def update(self):
        """Fetch the feed and append the tokens of unseen entries to the corpus.

        Sends the stored ETag / Last-Modified value (if any) with the
        request and returns early, changing nothing else, when the server
        answers 304.  Entries whose guid hash is already listed in
        'articles.txt' are skipped; entries without a guid are always
        tokenized.

        Raises Exception when the feed reports an HTTP status other than
        200 or 304.
        """
        # to keep traffic down, we'll only check for updates every so often
        last_update = self.storage.mtime('last_update')
        if datetime.datetime.now() - last_update < config.update_interval:
            # NOTE: the throttle is currently switched off -- this branch
            # used to return here, and now deliberately falls through
            pass
        self.storage.timestamp('last_update')

        self.storage.require_files(('articles.txt', 'corpus.txt'))
        articles_fd = self.storage.open('articles.txt')
        try:
            articles = set((t.strip() for t in articles_fd))
        finally:
            articles_fd.close()
        new_articles = set()

        # do we have an e-tag or last-modified?
        etag = last_modified = None
        if self.storage.has_file('etag.txt'):
            etag_fd = self.storage.open('etag.txt')
            try:
                etag = etag_fd.readline().strip()
            finally:
                etag_fd.close()
        elif self.storage.has_file('last-modified.txt'):
            try:
                modified_fd = self.storage.open('last-modified.txt')
                try:
                    last_modified = cPickle.load(modified_fd)
                finally:
                    modified_fd.close()
            except Exception:
                # unreadable or corrupt file: drop it and fetch
                # unconditionally
                self.storage.unlink('last-modified.txt')

        fp = feedparser.parse(self.uri, modified=last_modified, etag=etag)
        if hasattr(fp, "status"):
            if fp.status == 200:
                # okay
                pass
            elif fp.status == 304:
                # okay, just not modified since last time; return
                return
            elif fp.status == 403:
                raise Exception("Unable to retrieve RSS feed: permission denied.")
            elif fp.status == 404:
                raise Exception("Unable to retrieve RSS feed: not found.")
            else:
                raise Exception("Unable to retrieve RSS feed: HTTP error code %d" % (fp.status))

        def fp_generator():
            # lazily yields the tokens of every entry not seen before;
            # as a side effect records the hashes of new entries in
            # articles / new_articles while it is being consumed
            for entry in fp['entries']:
                if hasattr(entry, "guid") and entry.guid:
                    article = sha.new(entry.guid).hexdigest()
                    if article in articles:
                        continue
                    else:
                        articles.add(article)
                        new_articles.add(article)
                to_tokenize = []
                if hasattr(entry, "title"):
                    to_tokenize.append("title")
                if hasattr(entry, "description"):
                    to_tokenize.append("description")
                for attr in to_tokenize:
                    for token in tokenize(getattr(entry, attr)):
                        yield token

        # update last-modified and etag.  (a 304 response never gets here:
        # it returned above, so the stored values are kept in that case.)
        if hasattr(fp, "etag") and fp.etag:
            etag_fd = self.storage.open("etag.txt", "w")
            try:
                etag_fd.write(fp.etag+"\n")
            finally:
                etag_fd.close()
        else:
            self.storage.unlink("etag.txt")
        if hasattr(fp, "modified") and fp.modified:
            modified_fd = self.storage.open("last-modified.txt", "w")
            try:
                cPickle.dump(fp.modified, modified_fd)
            finally:
                modified_fd.close()
        else:
            self.storage.unlink("last-modified.txt")

        generator = fp_generator()
        if self.upwritten_seqs:
            # the module-level upwrite() yields lists of tokens, so flatten
            # them back into single tokens before writing them out
            generator = remove_token_sequences(
                upwrite(generator, self.upwritten_seqs))
        corpus_fd = self.storage.open('corpus.txt', 'a')
        try:
            for token in generator:
                corpus_fd.write(token+"\n")
        finally:
            corpus_fd.close()

        # new_articles is only complete once the generator above has been
        # fully consumed
        articles_fd = self.storage.open('articles.txt', 'a')
        try:
            for article in new_articles:
                articles_fd.write(article+"\n")
        finally:
            articles_fd.close()

    def build_state(self):
        """Build self.symbol_state from the tokens stored in 'corpus.txt'."""
        # the corpus file is handed over as a lazy token stream and is not
        # closed here; presumably SymbolState reads it during construction
        # or update() -- TODO confirm before closing it explicitly
        self.symbol_state = SymbolState((t.strip() for t in self.storage.open('corpus.txt')))
        self.symbol_state.update()

    def upwrite(self, to_upwrite):
        """Rewrite 'corpus.txt' with the given sequences combined.

        to_upwrite is a list of token sequences; nothing happens if it is
        empty.  The rewritten corpus is first written to 'upwrite.txt' and
        then renamed over 'corpus.txt'.
        """
        if not to_upwrite:
            return

        corpus_fd = self.storage.open('corpus.txt')
        try:
            upwrite_fd = self.storage.open('upwrite.txt', 'w')
            try:
                for token_list in upwrite((t.strip() for t in corpus_fd), to_upwrite):
                    for token in token_list:
                        upwrite_fd.write(token+'\n')
            finally:
                upwrite_fd.close()
        finally:
            # both files must be closed before the corpus is replaced
            corpus_fd.close()

        os.rename(self.storage.file('upwrite.txt'),
                  self.storage.file('corpus.txt'))