Below is the file 'mk2.py' from this revision. You can also download the file.

#!/usr/bin/env python

import heapq
import math
import sys

class MarkovState(object):
    def __init__(self, state):
        self.state = state
        self.h = None
        self.total = 0
        self.scores = {}

    def increment(self, token):
        self.total += 1
        self.scores[token] = self.scores.get(token, 0) + 1
        self.h = None

    def __entropy(self):
        return -1 * sum(map(lambda p: p * math.log(p, 2),
                        map(lambda x: (self.scores[x] / float(self.total)), self.scores)))
    def entropy(self):
        if self.h == None:
            self.h = self.__entropy()
        return self.h

    def __cmp__(self, other):
        if other == None:
            return -1
        return cmp(other.entropy(), self.entropy())

class MarkovChain(object):
    def __init__(self, length):
        self.states = {}
        self.length = length

    def update(self, gen):
        buffer = []
        for token in gen:
            if len(buffer) == self.length:
                tbuffer = tuple(buffer)
                if self.states.has_key(tbuffer):
                    state = self.states[tbuffer]
                else:
                    state = self.states[tbuffer] = MarkovState(tbuffer)
                state.increment(token)
                buffer = buffer[1:]
            buffer.append(token)

    def upchunk(self):
        q = []
        keys = self.states.keys()
        if len(keys) == 0:
            return
        for idx, tokens in enumerate(keys):
            state = self.states[tokens]
            heapq.heappush(q, state)
        cutoff = math.log (len (keys), 2) / 4
        candidate = heapq.heappop(q)
        print "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff)
        if candidate.entropy() < cutoff:
            return None
        else:
            return candidate.state

def simple_gen(fname):
    for line in open(fname, 'rb'):
        for char in line:
            yield char
#        for word in line.split():
#            yield word.lower()

def remember_gen(gen, remember):
    for i in gen:
        remember.append(i)
        yield i

def upchunk_gen(gen, to_upchunk):
    buffer = []
    for i in gen:
        buffer.append(i)
        if len(buffer) == len(to_upchunk):
            if tuple(buffer) == to_upchunk:
                buffer = [ ''.join(buffer) ]
            else:
                to_yield, buffer = buffer[0], buffer[1:]
                yield to_yield
    for i in buffer:
        yield i

if __name__ == '__main__':
    chain = MarkovChain(2)
    # first run through the mills..
    stash = []
    chain.update(remember_gen (simple_gen (sys.argv[1]), stash))
    print "processing produced", len(chain.states.keys()), "states."
    while True:
        to_upchunk = chain.upchunk()
        if to_upchunk == None:
            break
        new_stash = []
        new_chain = MarkovChain(chain.length)
        new_chain.update(remember_gen (upchunk_gen (stash, to_upchunk), new_stash))
        stash = new_stash
        chain = new_chain
    print "final model formed and has", len(chain.states.keys()), "states."