Below is the file 'poet.py' from this revision. You can also download the file.

#!/usr/bin/env python

import generators
from generators import RSSGenerator, HTTPGenerator
from dictionary import Word

import datetime
import urlparse
import cPickle
import profile
import config
import random
import shelve
import string
import math
import sha
import os

class SymbolLibrary:
    """Store symbols and issue them with an ID, which can be used in place of the word.

    IDs are consecutive integers starting at 0 and double as indexes into
    ``self.words``.  When a symbol is first seen its record is fetched from
    the shelf named by ``config.word_shelf``; symbols with no stored record
    get a fresh ``Word``.
    """

    def __init__(self):
        self.word_info_db = None
        # the index into the following list is the ID
        self.words = []
        self.next_id = 0
        # maps a symbol to the ID whose slot in self.words holds the
        # corresponding Word instance.
        self.symbol_to_id = {}
        self.open_db()

    def open_db(self):
        """Open the word-info shelf; does nothing if it is already open."""
        if self.word_info_db is None:
            self.word_info_db = shelve.open(config.word_shelf)

    def close_db(self):
        """Close the word-info shelf and forget it; does nothing if closed."""
        if self.word_info_db is not None:
            self.word_info_db.close()
            self.word_info_db = None

    def __assign_id_to_word(self, w):
        """Register the not-yet-seen symbol ``w`` and return its new ID."""
        # assumes that this is a word not yet seen
        if w in self.word_info_db:
            word = self.word_info_db[w]
        else:
            word = Word(w)
        new_id = self.next_id
        self.words.append(word)
        # Key the table by the symbol the caller handed in: that is the
        # exact object id_for_word() looks up on the next call.
        self.symbol_to_id[w] = new_id
        self.next_id = new_id + 1
        return new_id

    def word_from_id(self, id):
        """Return the word object stored under ``id``."""
        return self.words[id]

    def id_for_word(self, word):
        """Return the ID for ``word``, assigning a new one on first sight."""
        known = self.symbol_to_id.get(word)
        # IDs start at 0, so test against None rather than truthiness.
        if known is None:
            return self.__assign_id_to_word(word)
        return known

    def words_to_id_stream(self, g):
        """Yield, for each sequence of words in ``g``, the list of their IDs."""
        for s in g:
            yield [self.id_for_word(w) for w in s]

class MarkovScore:
    """Tally of how many times each token has been recorded.

    ``scores`` maps a token to its count and ``total`` is the number of
    recordings made overall.
    """

    def __init__(self):
        self.scores = {}
        self.total = 0

    def add_score(self, token):
        """Count one more occurrence of ``token``."""
        self.scores[token] = self.scores.get(token, 0) + 1
        self.total += 1

    def __repr__(self):
        summary = (self.total, self.scores)
        return str(summary)

class MarkovModel:
    """Fixed-order Markov model: maps a token tuple to a MarkovScore.

    ``size`` is the required length of every token tuple, ``scores`` maps
    each tuple to the tally of words recorded for it, and ``total`` counts
    every recording made.
    """

    def __init__(self, size):
        self.total = 0
        self.size = size
        self.scores = {}

    def add(self, tokens, word):
        """Record that ``word`` was observed for the context ``tokens``.

        Raises Exception when ``tokens`` does not have exactly ``size``
        elements.
        """
        if len(tokens) != self.size:
            raise Exception("token list is of incorrect size.")
        self.total += 1
        if tokens not in self.scores:
            self.scores[tokens] = MarkovScore()
        self.scores[tokens].add_score(word)

    def entropy(self, token_list):
        """Return the base-2 entropy of the words recorded for ``token_list``.

        Raises KeyError when ``token_list`` has never been recorded.
        """
        score = self.scores[token_list]
        weighted = []
        for token in score.scores:
            p = score.scores[token] / float(score.total)
            weighted.append(p * math.log(p, 2))
        return -1 * sum(weighted)

class SymbolState:
    """Forward and reverse Markov models built from a corpus of tokens.

    The forward model maps each run of ``chain_size`` tokens to the token
    that followed it; the reverse model maps each run to the token that
    preceded it.
    """

    def __init__(self, chain_size=2):
        self.forward_markov = MarkovModel(chain_size)
        self.reverse_markov = MarkovModel(chain_size)

    def update(self, corpus):
        """Feed every non-None token of ``corpus`` into both models."""
        size = self.forward_markov.size
        window = []
        # for the moment, we shall ignore sentences
        # as a concept, and just maintain the window over them
        # entropic chunking should add back the structure we need
        # anyway, and we'll get a much more 'rich' MarkovModel as a
        # result.
        for word in corpus:
            if word is None:
                # None marks a slot blanked out of the corpus; skip it.
                continue
            if len(window) == size:
                self.forward_markov.add(tuple(window), word)
                oldest = window.pop(0)
                # the shifted window (including the new word) is the
                # context that the dropped token preceded
                self.reverse_markov.add(tuple(window + [word]), oldest)
            window.append(word)

    def cutoff_entropy(self, sd_multiplier=8):
        """Return mean entropy plus ``sd_multiplier`` standard deviations.

        The statistics are taken over every context in the forward model.
        An empty model has no entropies to average, so 0.0 is returned
        instead of dividing by zero; chunk() yields nothing for an empty
        model whatever the cutoff is.
        """
        model = self.forward_markov
        entropies = [model.entropy(t) for t in model.scores]
        if not entropies:
            return 0.0
        n = len(entropies)
        mean_h = sum(entropies) / n
        sd_h = math.sqrt(sum([pow(h - mean_h, 2) for h in entropies]) / n)
        return mean_h + sd_multiplier * sd_h  # dodgy; should really justify.

    def chunk(self, cutoff_h):
        """Return ``(entropy, tokens)`` for each context at or above ``cutoff_h``."""
        model = self.forward_markov
        rv = []
        for token_list in model.scores:
            h = model.entropy(token_list)
            if h >= cutoff_h:
                rv.append((h, token_list))
        return rv

class Document:
    """A corpus of word IDs together with its symbol table and Markov state.

    ``corpus`` is a flat list of IDs issued by ``self.symbols``.  Slots set
    to None have been blanked out by upwrite() and are skipped by every
    consumer of the corpus.
    """

    def __init__(self):
        self.corpus = []
        self.symbols = SymbolLibrary()
        self.article_ids = set()
        self.update_state()

    def add(self, sentences):
        """Append the IDs of every word in ``sentences`` and rebuild state."""
        for sentence in self.symbols.words_to_id_stream(sentences):
            self.corpus.extend(sentence)
        self.update_state()

    def update_state(self):
        """Rebuild ``self.symbol_state`` from scratch out of the corpus."""
        self.symbol_state = SymbolState()
        self.symbol_state.update(self.corpus)

    def upwrite(self, seqs_to_upwrite):
        """Replace each listed ID sequence in the corpus by one combined ID.

        ensure you call update_state() after calling this!

        ``seqs_to_upwrite`` is an iterable of hashable ID sequences
        (tuples).  The corpus scan uses a single window length, taken from
        the last sequence supplied, so the sequences are expected to share
        one length.  An empty iterable leaves the corpus untouched.
        """
        substs = {}
        seq_len = 0
        for to_upwrite in seqs_to_upwrite:
            words, syllables = [], 0
            for word_id in to_upwrite:
                word = self.symbols.word_from_id(word_id)
                words.append(word)
                syllables += word.syllables
            new_word = Word(' '.join(words))
            new_word.info['syllables'] = syllables
            # NOTE(review): id_for_word() may build its own Word from this
            # symbol; confirm the syllable count set here survives that.
            substs[to_upwrite] = self.symbols.id_for_word(new_word)
            seq_len = len(to_upwrite)

        if not substs:
            # nothing to substitute (and no window length to scan with)
            return
        self._substitute(substs, seq_len)

    def _substitute(self, substs, seq_len):
        """Rewrite in place every run of ``seq_len`` tokens found in ``substs``."""
        corpus = self.corpus
        total = len(corpus)
        if seq_len < 1 or total < seq_len:
            return

        # do the substitution. for efficiency avoid
        # copying the entire text in memory (hence the
        # assignment of None to the list slots, handled
        # specially by all code using the corpus)
        last_start = total - seq_len  # inclusive: a run may end the corpus
        start = 0
        while start <= last_start:
            # gather the next seq_len non-None tokens from `start` onwards
            window = []
            end = start
            while end < total and len(window) < seq_len:
                token = corpus[end]
                if token is not None:
                    window.append(token)
                end += 1

            # so if the token layout is A None B the run spans 3 slots
            # (corpus[start:end])
            new_id = substs.get(tuple(window))
            if new_id is not None:
                corpus[start] = new_id
                corpus[start + 1:end] = [None] * (end - start - 1)
            start += 1

    def write_ids(self, ids):
        """Return the words for ``ids`` joined by spaces, skipping None."""
        rv = []
        for id in ids:
            if id is None:
                continue
            rv.append(self.symbols.word_from_id(id))
        return ' '.join(rv)

    def write_corpus(self):
        """Return the whole corpus rendered as text."""
        return self.write_ids(self.corpus)

def storage_dir(uri):
    """Return the storage directory for ``uri``, creating it if needed.

    The path is ``config.storage_path`` / sanitised host name / five
    8-character pieces of the SHA-1 hex digest of ``uri``.  A file named
    ``uri`` inside the directory records which URI owns it.

    Raises Exception when that file already exists but records a
    different URI.
    """
    # just an indication, obviously prone to abuse if someone feeds in a
    # malicious hostname
    allowed = string.letters + string.digits + '.'
    netloc = urlparse.urlparse(uri)[1]
    site = ''.join([c for c in netloc if c in allowed])
    # no ..
    site = site.lstrip('.')

    # but this should be safe
    digest = sha.new(uri).hexdigest()
    # list of directories in the storage path to get to this URI
    dirs = [site] + [digest[i:i + 8] for i in range(0, 40, 8)]

    c_dir = config.storage_path
    for name in dirs:
        c_dir = os.path.join(c_dir, name)
        if not os.access(c_dir, os.R_OK):
            os.mkdir(c_dir)

    uri_file = os.path.join(c_dir, 'uri')
    expected = uri + '\n'
    if not os.access(uri_file, os.R_OK):
        marker = open(uri_file, 'w')
        try:
            marker.write(expected)
        finally:
            marker.close()
    else:
        marker = open(uri_file)
        try:
            stored = marker.read()
        finally:
            marker.close()
        if stored != expected:
            raise Exception("Storage problem: the wrong URI is stored in this directory!")
    return c_dir

def document_for_uri(uri):
    """Load (or create) the Document stored for ``uri``, refreshing it daily.

    The document is unpickled from ``doc.pickle`` in the URI's storage
    directory when that file is readable, otherwise a new Document is
    made.  If it has no ``last_update`` or that is more than a day old,
    the feed is fetched, entropic chunking is applied and the document is
    pickled back.  The returned document has its word shelf open.
    """
    doc_file = os.path.join(storage_dir(uri), 'doc.pickle')
    if not os.access(doc_file, os.R_OK):
        doc = Document()
    else:
        # Unpickling runs code chosen by the file's author: doc.pickle must
        # only ever come from our own storage directory.
        # Binary mode: the file is written with pickle protocol 2.
        stored = open(doc_file, 'rb')
        try:
            doc = cPickle.load(stored)
        finally:
            stored.close()

    stale = (not hasattr(doc, 'last_update') or
             datetime.datetime.utcnow() - doc.last_update > datetime.timedelta(days=1))
    if stale:
        # FIXME: cache which articles we've seen
        # NOTE(review): the feed URL is hard-coded and ignores `uri`;
        # confirm whether `uri` should be passed to HTTPGenerator instead.
        doc.add(RSSGenerator(
                HTTPGenerator('localhost', 'http://glamdring.local/~grahame/rss'),
                doc.article_ids))
        doc.last_update = datetime.datetime.utcnow()
        # entropic chunking
        cutoff_h = doc.symbol_state.cutoff_entropy()
        seqs_to_chunk = doc.symbol_state.chunk(cutoff_h)
        if seqs_to_chunk:
            # only rewrite the corpus when something passed the cutoff
            doc.upwrite([seq for h, seq in seqs_to_chunk])
            doc.update_state()
        # close the shelf so the pickled document carries no open handle
        doc.symbols.close_db()
        out = open(doc_file, 'wb')
        try:
            cPickle.dump(doc, out, protocol=2)
        finally:
            out.close()

    doc.symbols.open_db()
    return doc

def haiku(doc, form=(5, 7, 5)):
    """Generate one line of token IDs per entry of ``form``.

    ``form`` lists the syllable target of each line.  Lines are grown by
    walking ``doc.symbol_state.forward_markov``, continuing from the tail
    of the previous line where it is long enough to act as a context.

    Returns a list of ID lists, or None as soon as a line comes out
    empty.  A line can fall short of its target when no continuation fits
    in the syllables that remain.
    """
    markov = doc.symbol_state.forward_markov

    def syl(token_id):
        return doc.symbols.word_from_id(token_id).syllables

    def pickfrom(possible, total, get_total):
        # weighted random choice: get_total(item) is the item's weight and
        # `total` must be the sum of all the weights
        k = random.randint(0, total - 1)
        for seq in possible:
            k -= get_total(seq)
            if k < 0:
                return seq
        raise Exception("Failed to pick a number - 'total' miscalcuation?")

    def generate_line(target, state):
        state = tuple(state)
        line = []
        count = 0

        if len(state) < markov.size:
            # we'll have to pick a starting point; it must leave room
            # within this line's own target
            seeds = [seq for seq in markov.scores
                     if sum(map(syl, seq)) < target]
            if not seeds:
                # no context is short enough to start this line
                return []
            seed_weight = lambda seq: markov.scores[seq].total
            seq = pickfrom(seeds, sum(map(seed_weight, seeds)), seed_weight)
            count += sum(map(syl, seq))
            line += list(seq)
            state = seq
        elif len(state) > markov.size:
            state = state[-markov.size:]

        while count < target:
            score = markov.scores.get(state)
            if score is None:
                # nothing was ever recorded after this context
                break
            room = target - count
            candidates = [tok for tok in score.scores if syl(tok) <= room]
            if not candidates:
                break
            tok_weight = lambda tok: score.scores[tok]
            token = pickfrom(candidates, sum(map(tok_weight, candidates)),
                             tok_weight)
            count += syl(token)
            state = (state + (token,))[1:]
            line.append(token)
        return line

    rv = []
    last_line = []
    for length in form:
        line = generate_line(length, last_line)
        if not line:
            return None
        rv.append(line)
        last_line = line
    return rv

if __name__ == '__main__':
    # Script entry point: build or refresh the document for the feed and
    # print one generated haiku; exit with status 1 if none could be made.
    import sys  # needed for sys.exit below; not imported at file level

    uri = 'http://glamdring.local/~grahame/rss'
    doc = document_for_uri(uri)
    h = haiku(doc)
    if not h:
        sys.exit(1)
    print("Woo, haiku is:")
    for seq in h:
        print(doc.write_ids(seq))