# Below is the file 'poet3.py' from this revision. You can also download the file.

#!/usr/bin/env python

#
# take poet.py and make it fast / sensible,
# ready for deployment on angrygoats.net
#

from generators import RSSGenerator, HTTPGenerator
from storage import Storage
from syllables import Syllables
from symbolstate import SymbolState
import os
import config
import cPickle
import web

def _failure_table(seq):
    """Return the Knuth-Morris-Pratt failure table for seq.

    table[k] is the length of the longest proper prefix of seq[:k + 1]
    that is also a suffix of it; it tells the matcher how much of a
    partial match survives a mismatch.
    """
    table = [0] * len(seq)
    matched = 0
    for pos in range(1, len(seq)):
        while matched and seq[pos] != seq[matched]:
            matched = table[matched - 1]
        if seq[pos] == seq[matched]:
            matched += 1
        table[pos] = matched
    return table


def upwrite(generator, to_upwrite):
    """upwrite a sequence of tokens.

       generator yields tokens (strings); each one is stripped of
       surrounding whitespace before matching.
       to_upwrite is a list of sequences to be upwritten: whenever one of
       them occurs in the token stream, the whole occurrence is replaced
       by a single token made of its members joined with a space.
       assumption: all sequences to be upwritten are equal length.

       Yields lists of tokens, in stream order. Every input token ends up
       in exactly one yielded list, either as itself or as part of an
       upwritten token. If to_upwrite is empty (or its sequences are
       empty) the stripped tokens are passed through unchanged.
       """
    if not to_upwrite or len(to_upwrite[0]) == 0:
        # nothing to search for; just pass the stripped tokens through
        for token in generator:
            yield [token.strip()]
        return

    seq_size = len(to_upwrite[0])
    tables = [_failure_table(seq) for seq in to_upwrite]
    # matches[i] is how many leading tokens of to_upwrite[i] the tail of
    # the stream currently matches
    matches = [0] * len(to_upwrite)
    # tokens read but not yet yielded, because they may turn out to be
    # the start of a sequence to upwrite
    pending = []
    for token in (t.strip() for t in generator):
        pending.append(token)
        for i, seq in enumerate(to_upwrite):
            matched = matches[i]
            # on a mismatch fall back to the longest shorter partial match
            # instead of restarting from zero, so an occurrence that
            # overlaps a failed attempt (e.g. 'a b' inside 'a a b') is
            # still found
            while matched and token != seq[matched]:
                matched = tables[i][matched - 1]
            if token == seq[matched]:
                matched += 1
            matches[i] = matched

            if matched == seq_size:
                # pending holds exactly the matched tokens here, because
                # everything longer than the longest partial match was
                # flushed after the previous token
                yield [' '.join(seq)]
                matches = [0] * len(to_upwrite)
                pending = []
                break
        longest_match = max(matches)
        if len(pending) > longest_match:
            # anything older than the longest live partial match can no
            # longer be part of a match; release it
            output_idx = len(pending) - longest_match
            output, pending = pending[:output_idx], pending[output_idx:]
            yield output
    if pending:
        # the input ended in the middle of a partial match; those tokens
        # are ordinary tokens and must not be dropped
        yield pending

def remove_token_sequences(generator):
    """Flatten a stream of token sequences into a stream of single tokens.

    generator is any iterable whose items are themselves iterables of
    tokens; the tokens are yielded one at a time, in order, and empty
    sequences contribute nothing.
    """
    flattened = (token for sequence in generator for token in sequence)
    for token in flattened:
        yield token

class Poet3:
    """Maintains the token corpus kept in storage for a single feed URI.

    The corpus lives in 'corpus.txt' (one token per line) inside the
    Storage created for the URI; 'articles.txt' lists the articles that
    have already been added to it.
    """

    def __init__(self, uri):
        """Open the storage for uri and load any saved upwritten sequences.

        self.upwritten_seqs is the unpickled content of 'upwritten.txt'
        when that file exists in storage, otherwise None.
        """
        self.uri = uri
        self.storage = Storage(self.uri)
        self.syllables = Syllables()
        if self.storage.has_file('upwritten.txt'):
            # NOTE(review): unpickling runs arbitrary code if the file has
            # been tampered with; only safe while storage is trusted.
            upwritten_fd = self.storage.open('upwritten.txt')
            try:
                self.upwritten_seqs = cPickle.load(upwritten_fd)
            finally:
                # close the handle instead of leaking it
                upwritten_fd.close()
        else:
            self.upwritten_seqs = None

    def update(self):
        """Append tokens produced from the feed to 'corpus.txt'.

        Reads the known articles from 'articles.txt', runs the RSS
        generator over self.uri, applies the saved upwritten sequences (if
        any) to the resulting tokens, appends them to 'corpus.txt' one per
        line, then appends everything in new_articles to 'articles.txt'.
        """
        self.storage.require_files(('articles.txt', 'corpus.txt'))

        articles, new_articles = set(), set()
        articles_fd = self.storage.open('articles.txt')
        try:
            for line in articles_fd:
                articles.add(line.strip())
        finally:
            articles_fd.close()

        corpus_fd = self.storage.open('corpus.txt', 'a')
        try:
            # presumably RSSGenerator skips entries already in `articles`
            # and records the ones it processes in `new_articles` - verify
            # against generators.py
            generator = RSSGenerator(HTTPGenerator(self.uri), articles, new_articles)
            # the generator yields token sequences, but we don't actually
            # *want* token sequences any more; we don't care and they break
            # the upwriting generator, so flatten them to single tokens.
            generator = remove_token_sequences(generator)
            if self.upwritten_seqs:
                generator = remove_token_sequences(upwrite(generator, self.upwritten_seqs))

            for token in generator:
                corpus_fd.write(token+"\n")
        finally:
            # close the corpus even if fetching or writing fails part-way
            corpus_fd.close()

        articles_fd = self.storage.open('articles.txt', 'a')
        try:
            for article in new_articles:
                articles_fd.write(article+"\n")
        finally:
            articles_fd.close()

    def build_state(self):
        """Build self.symbol_state from the stripped lines of 'corpus.txt'."""
        # NOTE(review): the corpus handle is left for the garbage collector;
        # SymbolState may read the generator lazily, so it is not closed
        # here - confirm against symbolstate.py before tightening this.
        self.symbol_state = SymbolState((t.strip() for t in self.storage.open('corpus.txt')))
        self.symbol_state.update()

    def upwrite(self, to_upwrite):
        """Rewrite 'corpus.txt' with the given sequences upwritten.

        to_upwrite is a list of token sequences as accepted by the
        module-level upwrite(); nothing happens when it is empty. The new
        corpus is written to 'upwrite.txt' and then renamed over
        'corpus.txt'.
        """
        if not to_upwrite:
            return

        corpus_fd = self.storage.open('corpus.txt')
        try:
            upwrite_fd = self.storage.open('upwrite.txt', 'w')
            try:
                for token_list in upwrite((t.strip() for t in corpus_fd), to_upwrite):
                    for token in token_list:
                        upwrite_fd.write(token+'\n')
            finally:
                upwrite_fd.close()
        finally:
            # both files must be closed before the rename below replaces
            # the corpus; an open handle on the target is leaked otherwise
            # (and blocks the rename on some platforms)
            corpus_fd.close()

        # only reached when the rewrite completed without error
        os.rename(self.storage.file('upwrite.txt'),
                  self.storage.file('corpus.txt'))