Below is the file 'poet.py' from this revision. You can also download the file.
#!/usr/bin/env python
"""Generate haiku from a Markov model built over text pulled from a feed.

A Document accumulates a corpus of word IDs issued by a SymbolLibrary.
SymbolState builds forward and reverse Markov models over that corpus,
contexts whose entropy exceeds a cutoff are "upwritten" into single
compound symbols, and haiku() walks the forward model to produce lines
with the requested syllable counts.

This is Python 2 code (cPickle, urlparse, sha, xrange).

NOTE: Document and everything reachable from it is pickled by
document_for_uri(), so class names and instance attribute names must
stay stable for existing doc.pickle files to keep loading.
"""

import generators
from generators import RSSGenerator, HTTPGenerator
from dictionary import Word
import datetime
import urlparse
import cPickle
import profile
import config
import random
import shelve
import string
import math
import sha
import sys
import os


class SymbolLibrary:
    """Store symbols and issue them with an ID, which can be used in
    place of the word."""

    def __init__(self):
        self.word_info_db = None
        # the index into the following array is the ID
        self.words = []
        self.next_id = 0
        # allows you to go from a symbol to the ID which has the
        # corresponding Word instance.
        self.symbol_to_id = {}
        self.open_db()

    def open_db(self):
        """Open the word-info shelf if it is not already open."""
        if self.word_info_db is None:
            self.word_info_db = shelve.open(config.word_shelf)

    def close_db(self):
        """Close the word-info shelf (if open) and forget the handle."""
        if self.word_info_db is not None:
            self.word_info_db.close()
            self.word_info_db = None

    def __assign_id_to_word(self, w):
        """Register `w` under the next free ID and return that ID.

        Assumes that this is a word not yet seen.  The stored entry is
        taken from the word-info shelf when present there, otherwise a
        new Word is built from `w`.
        """
        if w in self.word_info_db:
            w = self.word_info_db[w]
        else:
            w = Word(w)
        new_id = self.next_id
        self.words.append(w)
        self.symbol_to_id[w] = new_id
        self.next_id += 1
        return new_id

    def word_from_id(self, id):
        """Return the stored word for `id` (raises IndexError if unknown)."""
        return self.words[id]

    def id_for_word(self, word):
        """Return the ID for `word`, assigning a new one on first sight."""
        # 0 is a valid ID, so test membership rather than truthiness.
        if word in self.symbol_to_id:
            return self.symbol_to_id[word]
        return self.__assign_id_to_word(word)

    def words_to_id_stream(self, g):
        """Yield one list of word IDs for each sequence of words in `g`."""
        for sentence in g:
            yield [self.id_for_word(w) for w in sentence]


class MarkovScore:
    """Occurrence counts of the tokens seen after one particular context."""

    def __init__(self):
        # token -> number of times it was seen
        self.scores = {}
        # sum of all the counts in self.scores
        self.total = 0

    def add_score(self, token):
        """Record one more occurrence of `token`."""
        self.scores[token] = self.scores.get(token, 0) + 1
        self.total += 1

    def __repr__(self):
        return str((self.total, self.scores))


class MarkovModel:
    """Fixed-order Markov model: context tuple -> MarkovScore."""

    def __init__(self, size):
        # number of (context, token) observations added so far
        self.total = 0
        # required length of every context tuple
        self.size = size
        self.scores = {}

    def add(self, tokens, word):
        """Record that `word` was observed after the context `tokens`.

        `tokens` is used as a dictionary key, so it must be hashable.
        Raises Exception if its length differs from self.size.
        """
        if len(tokens) != self.size:
            raise Exception("token list is of incorrect size.")
        self.total += 1
        self.scores.setdefault(tokens, MarkovScore()).add_score(word)

    def entropy(self, token_list):
        """Return the entropy, in bits, of the tokens seen after
        `token_list`.  Raises KeyError for a context never added."""
        score = self.scores[token_list]
        total = float(score.total)
        probabilities = [n / total for n in score.scores.values()]
        return -1 * sum([p * math.log(p, 2) for p in probabilities])


class SymbolState:
    """Forward and reverse Markov models built over a corpus of IDs."""

    def __init__(self, chain_size=2):
        self.forward_markov = MarkovModel(chain_size)
        self.reverse_markov = MarkovModel(chain_size)

    def update(self, corpus):
        """Feed every non-None entry of `corpus` into both models."""
        buffer = []
        # for the moment, we shall ignore sentences
        # as a concept, and just maintain the buffer over them
        # entropic chunking should add back the structure we need
        # anyway, and we'll get a much more 'rich' MarkovModel as a
        # result.
        for word in corpus:
            if word is None:
                # hole left behind by Document.upwrite()
                continue
            if len(buffer) == self.forward_markov.size:
                self.forward_markov.add(tuple(buffer), word)
                # the reverse model maps the following window back to
                # the token that preceded it
                r_buffer = buffer[1:] + [word]
                self.reverse_markov.add(tuple(r_buffer), buffer[0])
                buffer = buffer[1:]
            buffer.append(word)

    def cutoff_entropy(self, num_sd=8):
        """Return mean + `num_sd` standard deviations of the forward
        model's per-context entropies.

        Returns 0.0 for an empty model instead of dividing by zero.
        The default multiplier of 8 is dodgy; should really justify.
        """
        entropies = [self.forward_markov.entropy(t)
                     for t in self.forward_markov.scores]
        if not entropies:
            return 0.0
        n = len(entropies)
        mean_h = sum(entropies) / n
        sd_h = math.sqrt(sum([pow(h - mean_h, 2) for h in entropies]) / n)
        return mean_h + num_sd * sd_h

    def chunk(self, cutoff_h):
        """Return (entropy, context) pairs for every forward-model
        context whose entropy is at least `cutoff_h`."""
        rv = []
        for token_list in self.forward_markov.scores:
            h = self.forward_markov.entropy(token_list)
            if h >= cutoff_h:
                rv.append((h, token_list))
        return rv


class Document:
    """A corpus of word IDs plus the symbol table and Markov state
    derived from it."""

    def __init__(self):
        # flat list of word IDs; None marks a slot emptied by upwrite()
        self.corpus = []
        self.symbols = SymbolLibrary()
        self.article_ids = set()
        self.update_state()

    def add(self, sentences):
        """Append every word of every sentence to the corpus, then
        rebuild the Markov state."""
        for sentence in self.symbols.words_to_id_stream(sentences):
            self.corpus.extend(sentence)
        self.update_state()

    def update_state(self):
        """Rebuild self.symbol_state from scratch over the corpus."""
        self.symbol_state = SymbolState()
        self.symbol_state.update(self.corpus)

    def upwrite(self, seqs_to_upwrite):
        """Replace each occurrence of the given ID sequences in the
        corpus with a single new compound symbol.

        `seqs_to_upwrite` is an iterable of hashable ID sequences
        (tuples).  All sequences are expected to share one length; the
        scan window uses the length of the last sequence.

        ensure you call update_state() after calling this!
        """
        substs = {}
        u = None
        for to_upwrite in seqs_to_upwrite:
            words, syllables = [], 0
            for word_id in to_upwrite:
                word = self.symbols.word_from_id(word_id)
                words.append(word)
                syllables += word.syllables
            # the join presumably relies on Word being string-like
            new_word = Word(' '.join(words))
            new_word.info['syllables'] = syllables
            substs[to_upwrite] = self.symbols.id_for_word(new_word)
            u = len(to_upwrite)
        if not substs:
            # nothing to upwrite; previously this fell through and hit
            # the unbound loop variable
            return
        l = len(self.corpus)
        if l < u:
            return
        # do the substitution. for efficiency avoid
        # copying the entire text in memory (hence the
        # assignment of None to the array values, handled
        # specially by all code using the corpus)
        #
        # "+ 1" so that the window ending on the very last corpus entry
        # is examined as well.
        for i in xrange(l - u + 1):
            non_null = []
            # this code takes up 3 seconds!?!
            for j in xrange(i, l):
                if self.corpus[j] is not None:
                    non_null.append(self.corpus[j])
                    if len(non_null) == u:
                        break
            # so if the token layout is A None B
            # distance = 3
            distance = j + 1 - i
            new_id = substs.get(tuple(non_null))
            if new_id is not None:
                self.corpus[i] = new_id
                for k in xrange(1, distance):
                    self.corpus[i + k] = None

    def write_ids(self, ids):
        """Return the words for `ids` joined by spaces, skipping None."""
        rv = []
        for word_id in ids:
            if word_id is None:
                continue
            rv.append(self.symbols.word_from_id(word_id))
        return ' '.join(rv)

    def write_corpus(self):
        """Return the whole corpus rendered as text."""
        return self.write_ids(self.corpus)


def storage_dir(uri):
    """Return the storage directory for `uri`, creating it if needed.

    The path is config.storage_path / <sanitised host> / <five 8-char
    pieces of the SHA-1 hex digest of the URI>.  A file named 'uri'
    inside it records which URI owns the directory.

    Raises Exception if that file already exists with different
    content.
    """
    # just an indication, obviously prone to abuse if someone feeds in
    # a malicious hostname
    allowed = string.letters + string.digits + '.'
    netloc = urlparse.urlparse(uri)[1]
    site = ''.join([c for c in netloc if c in allowed])
    # no ..
    site = site.lstrip('.')
    # but this should be safe
    digest = sha.new(uri).hexdigest()
    # list of directories in the storage path to get to this URI
    parts = [site] + [digest[i:i + 8] for i in range(0, 40, 8)]

    c_dir = config.storage_path
    for part in parts:
        c_dir = os.path.join(c_dir, part)
        if not os.access(c_dir, os.R_OK):
            os.mkdir(c_dir)

    uri_file = os.path.join(c_dir, 'uri')
    expected = uri + '\n'
    if not os.access(uri_file, os.R_OK):
        f = open(uri_file, 'w')
        try:
            f.write(expected)
        finally:
            f.close()
    else:
        f = open(uri_file)
        try:
            stored = f.read()
        finally:
            f.close()
        if stored != expected:
            raise Exception("Storage problem: the wrong URI is stored in this directory!")
    return c_dir


def document_for_uri(uri):
    """Load (or create) the Document stored for `uri`, refreshing it
    when it has never been updated or is more than a day old.

    A refresh adds the feed content, performs entropic chunking and
    writes the document back to doc.pickle in the storage directory.
    The returned document always has its word-info shelf open.
    """
    doc_file = os.path.join(storage_dir(uri), 'doc.pickle')
    if not os.access(doc_file, os.R_OK):
        doc = Document()
    else:
        # NOTE: unpickling executes whatever the file describes; this
        # is only acceptable because the file is our own cache.
        f = open(doc_file, 'rb')
        try:
            doc = cPickle.load(f)
        finally:
            f.close()
        # The shelf is closed before the document is dumped, so a
        # loaded document comes back with no DB handle; reopen it or
        # the next doc.add() fails on the missing handle.
        doc.symbols.open_db()

    if not hasattr(doc, 'last_update') or \
            datetime.datetime.utcnow() - doc.last_update > datetime.timedelta(days=1):
        # FIXME: cache which articles we've seen
        # NOTE(review): the feed location is hard-coded and `uri` is
        # only used to pick the storage directory -- confirm intended.
        doc.add(RSSGenerator(
            HTTPGenerator('localhost', 'http://glamdring.local/~grahame/rss'),
            doc.article_ids))
        doc.last_update = datetime.datetime.utcnow()

        # entropic chunking
        cutoff_h = doc.symbol_state.cutoff_entropy()
        seqs_to_chunk = doc.symbol_state.chunk(cutoff_h)
        doc.upwrite((t[1] for t in seqs_to_chunk))
        doc.update_state()

        # the shelf handle is dropped for the duration of the dump
        doc.symbols.close_db()
        try:
            # protocol 2 is a binary format, so use a binary-mode file
            f = open(doc_file, 'wb')
            try:
                cPickle.dump(doc, f, protocol=2)
            finally:
                f.close()
        finally:
            doc.symbols.open_db()
    return doc


def haiku(doc, form=(5, 7, 5)):
    """Generate one line of word IDs per entry of `form`.

    Each entry of `form` is the syllable count required of that line.
    Returns a list of lines (each a list of word IDs), or None when
    any line could not be completed with exactly the right count.
    """
    markov = doc.symbol_state.forward_markov

    def syl(word_id):
        return doc.symbols.word_from_id(word_id).syllables

    def pickfrom(possible, total, get_total):
        # weighted random choice: get_total(x) is the weight of x and
        # `total` must be the sum of all the weights
        k = random.randint(0, total - 1)
        for seq in possible:
            k -= get_total(seq)
            if k < 0:
                return seq
        raise Exception("Failed to pick a number - 'total' miscalcuation?")

    def generate_line(target, state):
        logger.log("starting line: %s" % target)
        state = tuple(state)
        line = []
        count = 0

        if len(state) < markov.size:
            logger.log("start calc sp")
            if markov.total <= 0:
                # empty model: nothing to start from
                logger.log("count=%d, target=%d :-(" % (count, target))
                return None
            # we'll have to pick a starting point.  restricting the
            # candidates to contexts with fewer syllables than the
            # first line would be the most accurate, but is way too
            # slow, so choose from every context by frequency.
            seq = pickfrom(markov.scores, markov.total,
                           lambda seq: markov.scores[seq].total)
            count += sum(map(syl, seq))
            line += list(seq)
            state = seq
            logger.log("end calc sp")
        elif len(state) > markov.size:
            state = state[-1 * markov.size:]

        while count < target:
            maxsize = target - count
            # The seed state is the tail of the previous line, which is
            # not guaranteed to be a context the model has seen.
            score = markov.scores.get(state)
            if score is None:
                break
            # okay, if count + syl(next_token) != target then we need
            # there to be an entry in the symbolstate for that next
            # potential symbol. this lets us restrict further and not
            # fall down holes so often
            possible = set()
            for tok in score.scores:
                tok_syl = syl(tok)
                if tok_syl > maxsize:
                    continue
                if (count + tok_syl == target
                        or (state + (tok,))[1:] in markov.scores):
                    possible.add(tok)
            # debug trace; `rv` is the list of lines built so far
            print("%s %s %s %s %s" % (target, rv, state,
                                      score.scores.keys(), possible))
            if not possible:
                break
            total_possible = sum([score.scores[tok] for tok in possible])
            token = pickfrom(possible, total_possible,
                             lambda tok: score.scores[tok])
            count += syl(token)
            state = (state + (token,))[1:]
            line.append(token)

        if count != target:
            logger.log("count=%d, target=%d :-(" % (count, target))
            return None
        return line

    logger.log("starting haiku generation")
    rv = []
    last_line = []
    for length in form:
        line = generate_line(length, last_line)
        if not line:
            return None
        rv.append(line)
        # the next line is seeded with the tail of this one
        last_line = line
    return rv


class Log:
    """Print messages prefixed with elapsed time since start and since
    the previous message."""

    def __init__(self):
        self.start_time = self.last_time = datetime.datetime.utcnow()

    def log(self, s):
        n = datetime.datetime.utcnow()
        print("%s : +%s : %s" % (n - self.start_time, n - self.last_time, s))
        self.last_time = n


logger = Log()

if __name__ == '__main__':
    uri = 'http://glamdring.local/~grahame/rss'
    logger.log("start")
    doc = document_for_uri(uri)
    logger.log("doc loaded")
    h = haiku(doc)
    logger.log("haiku done")
    if not h:
        sys.exit(1)
    logger.log("print haiku")
    for seq in h:
        print(doc.write_ids(seq))
    logger.log("complete")