Below is the file 'poet3.py' from this revision. You can also download the file.
#!/usr/bin/env python
#
# take poet.py and make it fast / sensible,
# ready for deployment on angrygoats.net
#

from generators import tokenize
from storage import Storage
from syllables import Syllables
from symbolstate import SymbolState
import feedparser
import datetime
import cPickle
import config
import sha
import web
import os


def _fallback_match_length(buffer, seq, limit):
    """Return the largest k <= limit where the last k buffered tokens equal seq[:k].

    Used after a mismatch so that a partial match which overlaps the start of
    the sequence (e.g. tokens "a a b" against the sequence "a b") is not lost.
    Returns 0 when no prefix of `seq` ends at the current token.
    """
    for k in range(limit, 0, -1):
        if buffer[-k:] == list(seq[:k]):
            return k
    return 0


def upwrite(generator, to_upwrite):
    """upwrite a sequence of tokens.

    generator yields string tokens; each one is stripped before matching.
    to_upwrite is a list of sequences to be upwritten.
    assumption: all sequences to be upwritten are equal length.

    This is a generator of *lists* of tokens: a completed sequence is yielded
    as a one-element list holding its tokens joined by a single space, and
    tokens that are not part of any match are yielded in order in lists of
    one or more tokens. Use remove_token_sequences() to flatten the output.
    """
    if not to_upwrite or len(to_upwrite[0]) == 0:
        # nothing can ever match: pass every (stripped) token straight through
        for token in generator:
            yield [token.strip()]
        return

    seq_size = len(to_upwrite[0])
    # matches[i] is how many leading tokens of to_upwrite[i] the most recent
    # input tokens currently match
    matches = [0] * len(to_upwrite)
    # tokens read but not yet emitted; after each step it holds exactly the
    # tail of the input that is still part of the longest partial match
    buffer = []
    for token in (t.strip() for t in generator):
        buffer.append(token)
        for i, seq in enumerate(to_upwrite):
            if token == seq[matches[i]]:
                matches[i] += 1
            else:
                # don't just restart from zero: the tokens we have just seen
                # may still form a shorter prefix of this sequence
                matches[i] = _fallback_match_length(buffer, seq, matches[i])
            if matches[i] == seq_size:
                # the buffer is exactly the matched sequence at this point
                yield [' '.join(to_upwrite[i])]
                matches = [0] * len(to_upwrite)
                buffer = []
                break
        longest_match = max(matches)
        if len(buffer) > longest_match:
            # everything before the current partial match can be released
            output_idx = len(buffer) - longest_match
            output, buffer = buffer[:output_idx], buffer[output_idx:]
            yield output
    if buffer:
        # input ended in the middle of a partial match; emit the held-back
        # tokens rather than silently dropping them
        yield buffer


def remove_token_sequences(generator):
    """Flatten a generator of token sequences into a generator of tokens."""
    for token_seq in generator:
        for token in token_seq:
            yield token


class Poet3:
    """Token corpus for one feed, kept in a Storage area keyed by site and uri.

    On construction the feed at `uri` is fetched, any new entries are
    tokenized and appended to corpus.txt, and a SymbolState is built from the
    corpus. The first time round, the sequences reported by
    SymbolState.chunkable() are combined ("upwritten") throughout the corpus
    and remembered in upwritten.txt for use on later updates.
    """

    def __init__(self, site, uri):
        self.site, self.uri = site, uri
        self.storage = Storage(self.site, self.uri)
        self.syllables = Syllables()
        if self.storage.has_file('upwritten.txt'):
            self.upwritten_seqs = self._load_pickle('upwritten.txt')
        else:
            self.upwritten_seqs = None
        self.update()
        self.build_state()
        # if there has been no upwriting yet, then let's do some now.
        # we save the resulting combined tokens, so the code uses those
        # when adding new tokens to the corpus from now on
        if not self.upwritten_seqs:
            to_upwrite = self.symbol_state.chunkable()
            self._dump_pickle(to_upwrite, 'upwritten.txt')
            self.upwrite(to_upwrite)
            # remember them on this instance too, so a later update() on the
            # same object behaves like one on a freshly constructed object
            self.upwritten_seqs = to_upwrite

    def _load_pickle(self, name):
        """Unpickle and return the contents of storage file `name`."""
        # NOTE: pickle must only be used on files this program wrote itself
        fd = self.storage.open(name)
        try:
            return cPickle.load(fd)
        finally:
            fd.close()

    def _dump_pickle(self, obj, name):
        """Pickle `obj` into storage file `name`, replacing its contents."""
        fd = self.storage.open(name, 'w')
        try:
            cPickle.dump(obj, fd)
        finally:
            fd.close()

    def has_data(self):
        """Return True if the forward markov model has seen any tokens."""
        return self.symbol_state.forward_markov.total > 0

    def update(self):
        """Fetch the feed and append tokens from unseen entries to the corpus.

        Sends the stored etag (or, failing that, last-modified value) with the
        request and returns early on HTTP 304. Raises Exception with a
        descriptive message on any HTTP status other than 200 or 304.
        """
        # to keep traffic down, we'll only check for updates every so often
        last_update = self.storage.mtime('last_update')
        if datetime.datetime.now() - last_update < config.update_interval:
            # NOTE(review): the early return here was commented out in the
            # original, so the interval is currently not enforced. left as-is;
            # confirm whether the throttle should be re-enabled.
            pass  # return
        self.storage.timestamp('last_update')
        self.storage.require_files(('articles.txt', 'corpus.txt'))
        articles_fd = self.storage.open('articles.txt')
        try:
            articles = set(t.strip() for t in articles_fd)
        finally:
            articles_fd.close()
        new_articles = set()

        # do we have an e-tag or last-modified?
        etag = last_modified = None
        if self.storage.has_file('etag.txt'):
            etag_fd = self.storage.open('etag.txt')
            try:
                etag = etag_fd.readline().strip()
            finally:
                etag_fd.close()
        elif self.storage.has_file('last-modified.txt'):
            try:
                last_modified = self._load_pickle('last-modified.txt')
            except Exception:
                # unreadable file: drop it and fetch unconditionally
                self.storage.unlink('last-modified.txt')

        fp = feedparser.parse(self.uri, modified=last_modified, etag=etag)
        if hasattr(fp, "status"):
            if fp.status == 200:
                # okay
                pass
            elif fp.status == 304:
                # okay, just not modified since last time; return
                return
            elif fp.status == 403:
                raise Exception("Unable to retrieve RSS feed: permission denied.")
            elif fp.status == 404:
                raise Exception("Unable to retrieve RSS feed: not found.")
            else:
                raise Exception("Unable to retrieve RSS feed: HTTP error code %d" % (fp.status))

        def fp_generator():
            # lazily yields the tokens of every entry not seen before; entries
            # without a guid cannot be recognised and are always included.
            # `articles` and `new_articles` are filled in as this is consumed.
            for entry in fp['entries']:
                if hasattr(entry, "guid") and entry.guid:
                    article = sha.new(entry.guid).hexdigest()
                    if article in articles:
                        continue
                    else:
                        articles.add(article)
                        new_articles.add(article)
                to_tokenize = []
                if hasattr(entry, "title"):
                    to_tokenize.append("title")
                if hasattr(entry, "description"):
                    to_tokenize.append("description")
                for attr in to_tokenize:
                    for token in tokenize(getattr(entry, attr)):
                        yield token

        # update last-modified and etag; a 304 response has already returned
        # above, so the stored values are left untouched in that case
        if hasattr(fp, "etag") and fp.etag:
            etag_fd = self.storage.open("etag.txt", "w")
            try:
                etag_fd.write(fp.etag + "\n")
            finally:
                etag_fd.close()
        else:
            self.storage.unlink("etag.txt")
        if hasattr(fp, "modified") and fp.modified:
            self._dump_pickle(fp.modified, "last-modified.txt")
        else:
            self.storage.unlink("last-modified.txt")

        generator = fp_generator()
        if self.upwritten_seqs:
            # upwrite() yields lists of tokens, so flatten them back into a
            # plain token stream before writing one token per line
            generator = remove_token_sequences(upwrite(generator, self.upwritten_seqs))
        corpus_fd = self.storage.open('corpus.txt', 'a')
        try:
            for token in generator:
                corpus_fd.write(token + "\n")
        finally:
            corpus_fd.close()

        # new_articles is only complete once the generator has been consumed
        articles_fd = self.storage.open('articles.txt', 'a')
        try:
            for article in new_articles:
                articles_fd.write(article + "\n")
        finally:
            articles_fd.close()

    def build_state(self):
        """(Re)build self.symbol_state from the tokens in corpus.txt."""
        # the corpus file is handed over as a lazy generator and is not closed
        # here, because it is not visible from this file when SymbolState
        # actually reads it
        self.symbol_state = SymbolState((t.strip() for t in self.storage.open('corpus.txt')))
        self.symbol_state.update()

    def upwrite(self, to_upwrite):
        """Rewrite corpus.txt with every sequence in to_upwrite combined.

        The result is written to upwrite.txt and then renamed over corpus.txt.
        Does nothing when to_upwrite is empty.
        """
        if not to_upwrite:
            return
        corpus_fd = self.storage.open('corpus.txt')
        try:
            upwrite_fd = self.storage.open('upwrite.txt', 'w')
            try:
                # inside a method the bare name `upwrite` is the module-level
                # generator function, not this method
                for token_list in upwrite((t.strip() for t in corpus_fd), to_upwrite):
                    for token in token_list:
                        upwrite_fd.write(token + '\n')
            finally:
                upwrite_fd.close()
        finally:
            # close the old corpus before it is replaced
            corpus_fd.close()
        os.rename(self.storage.file('upwrite.txt'), self.storage.file('corpus.txt'))