Below is the file 'poet3.py' from this revision. You can also download the file.
#!/usr/bin/env python
#
# take poet.py and make it fast / sensible,
# ready for deployment on angrygoats.net
#

from generators import RSSGenerator, HTTPGenerator
from storage import Storage
from syllables import Syllables
from symbolstate import SymbolState
import os
import config
import cPickle
import web


def _failure_table(seq):
    """Return the Knuth-Morris-Pratt failure table for seq.

    table[k] is the length of the longest proper prefix of seq[:k + 1]
    that is also a suffix of seq[:k + 1].  It tells the matcher how much of
    a partial match survives a mismatch, so overlapping candidates such as
    'a a b' against the pattern 'a b' are not missed.
    """
    table = [0] * len(seq)
    matched = 0
    for idx in range(1, len(seq)):
        while matched and seq[idx] != seq[matched]:
            matched = table[matched - 1]
        if seq[idx] == seq[matched]:
            matched += 1
        table[idx] = matched
    return table


def upwrite(generator, to_upwrite):
    """Upwrite a stream of tokens.

    Every run of tokens equal to one of the sequences in to_upwrite is
    replaced by a single token: the sequence joined with spaces.

    generator  -- iterable of string tokens; each token is stripped of
                  surrounding whitespace before it is compared.
    to_upwrite -- list of token sequences to be upwritten.  Empty
                  sequences are ignored.  Sequences are normally all the
                  same length; if they are not, the match that completes
                  first wins, and ties go to the earlier entry.

    Yields lists of tokens, in stream order.  A completed match is yielded
    as a one-element list holding the joined token.  Tokens that are the
    start of a possible match are held back until the match either
    completes or fails; whatever is still held back when the stream ends
    is yielded unchanged, so no token is dropped.
    """
    # A zero-length sequence can never be matched and would otherwise
    # cause an IndexError when its first token is looked up.
    patterns = [seq for seq in to_upwrite if len(seq)]
    tables = [_failure_table(seq) for seq in patterns]
    matches = [0] * len(patterns)
    # Tokens read but not yet yielded because they may begin a match.
    pending = []
    for token in (t.strip() for t in generator):
        pending.append(token)
        completed = None
        for i, seq in enumerate(patterns):
            matched = matches[i]
            # On a mismatch fall back to the longest shorter partial match
            # instead of restarting from zero.
            while matched and token != seq[matched]:
                matched = tables[i][matched - 1]
            if token == seq[matched]:
                matched += 1
            matches[i] = matched
            if matched == len(seq):
                completed = seq
                break
        if completed is not None:
            # Only non-empty when the sequences differ in length: tokens
            # held back for a longer candidate that did not complete.
            leading = pending[:-len(completed)]
            if leading:
                yield leading
            yield [' '.join(completed)]
            matches = [0] * len(patterns)
            pending = []
            continue
        longest_match = max(matches) if matches else 0
        if len(pending) > longest_match:
            # Everything ahead of the longest live partial match can no
            # longer be part of any match, so release it.
            output_idx = len(pending) - longest_match
            output, pending = pending[:output_idx], pending[output_idx:]
            yield output
    # The stream ended in the middle of a partial match: those tokens are
    # ordinary tokens after all and must not be lost.
    if pending:
        yield pending


def remove_token_sequences(generator):
    """Flatten an iterable of token sequences into a stream of tokens."""
    for token_seq in generator:
        for token in token_seq:
            yield token


class Poet3:
    """Maintains the token corpus kept in the Storage for one URI.

    The storage holds 'corpus.txt' (one token per line), 'articles.txt'
    (one article identifier per line) and, optionally, 'upwritten.txt',
    a pickle of the sequences to upwrite while updating.
    """

    def __init__(self, uri):
        """Open the storage for uri and load the upwritten sequences, if any."""
        self.uri = uri
        self.storage = Storage(self.uri)
        self.syllables = Syllables()
        self.upwritten_seqs = None
        if self.storage.has_file('upwritten.txt'):
            # NOTE(review): unpickling can execute arbitrary code; this is
            # only safe while the storage contents are trusted -- confirm.
            pickle_fd = self.storage.open('upwritten.txt')
            try:
                self.upwritten_seqs = cPickle.load(pickle_fd)
            finally:
                pickle_fd.close()

    def update(self):
        """Append newly fetched tokens to the corpus and record the articles.

        Tokens are appended to 'corpus.txt' one per line, upwritten with
        self.upwritten_seqs when that is set.  The contents of new_articles
        are appended to 'articles.txt' only after the corpus has been
        written.
        """
        self.storage.require_files(('articles.txt', 'corpus.txt'))
        new_articles = set()
        articles_fd = self.storage.open('articles.txt')
        try:
            articles = set(line.strip() for line in articles_fd)
        finally:
            articles_fd.close()
        corpus_fd = self.storage.open('corpus.txt', 'a')
        try:
            # presumably RSSGenerator skips entries already in `articles`
            # and adds the ones it reads to `new_articles` -- verify
            # against generators.py
            generator = RSSGenerator(HTTPGenerator(self.uri), articles, new_articles)
            # except that we don't actually *want* token sequences any more; we don't care
            # and it breaks the upwriting generator.
            generator = remove_token_sequences(generator)
            if self.upwritten_seqs:
                generator = remove_token_sequences(upwrite(generator, self.upwritten_seqs))
            for token in generator:
                corpus_fd.write(token + "\n")
        finally:
            # Close (and so flush) the corpus even if fetching fails midway.
            corpus_fd.close()
        articles_fd = self.storage.open('articles.txt', 'a')
        try:
            for article in new_articles:
                articles_fd.write(article + "\n")
        finally:
            articles_fd.close()

    def build_state(self):
        """Build self.symbol_state from the tokens in 'corpus.txt'."""
        # The corpus handle is deliberately left open: SymbolState receives
        # a lazy generator and may still be reading from it after this
        # method returns -- TODO confirm and close it once that is known.
        self.symbol_state = SymbolState((t.strip() for t in self.storage.open('corpus.txt')))
        self.symbol_state.update()

    def upwrite(self, to_upwrite):
        """Rewrite 'corpus.txt' with the sequences in to_upwrite upwritten.

        The new corpus is written to 'upwrite.txt' and then renamed over
        'corpus.txt'.  Does nothing when to_upwrite is empty.
        """
        if not to_upwrite:
            return
        corpus_fd = self.storage.open('corpus.txt')
        try:
            upwrite_fd = self.storage.open('upwrite.txt', 'w')
            try:
                for token_list in upwrite((t.strip() for t in corpus_fd), to_upwrite):
                    for token in token_list:
                        upwrite_fd.write(token + '\n')
            finally:
                upwrite_fd.close()
        finally:
            # Release the old corpus before it is replaced below.
            corpus_fd.close()
        os.rename(self.storage.file('upwrite.txt'), self.storage.file('corpus.txt'))