Below is the file 'mk2.py' from this revision. You can also download the file.
#!/usr/bin/env python import heapq import math import sys class MarkovState(object): def __init__(self, state): self.state = state self.h = None self.total = 0 self.scores = {} def increment(self, token): self.total += 1 self.scores[token] = self.scores.get(token, 0) + 1 self.h = None def __entropy(self): return -1 * sum(map(lambda p: p * math.log(p, 2), map(lambda x: (self.scores[x] / float(self.total)), self.scores))) def entropy(self): if self.h == None: self.h = self.__entropy() return self.h def __cmp__(self, other): if other == None: return -1 return cmp(other.entropy(), self.entropy()) class MarkovChain(object): def __init__(self, length): self.states = {} self.length = length def update(self, gen): buffer = [] for token in gen: if len(buffer) == self.length: tbuffer = tuple(buffer) if self.states.has_key(tbuffer): state = self.states[tbuffer] else: state = self.states[tbuffer] = MarkovState(tbuffer) state.increment(token) buffer = buffer[1:] buffer.append(token) def upchunk(self): q = [] keys = self.states.keys() if len(keys) == 0: return for idx, tokens in enumerate(keys): state = self.states[tokens] heapq.heappush(q, state) cutoff = math.log (len (keys), 2) / 4 candidate = heapq.heappop(q) print "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff) if candidate.entropy() < cutoff: return None else: return candidate.state def simple_gen(fname): for line in open(fname, 'rb'): for char in line: yield char # for word in line.split(): # yield word.lower() def remember_gen(gen, remember): for i in gen: remember.append(i) yield i def upchunk_gen(gen, to_upchunk): buffer = [] for i in gen: buffer.append(i) if len(buffer) == len(to_upchunk): if tuple(buffer) == to_upchunk: buffer = [ ''.join(buffer) ] else: to_yield, buffer = buffer[0], buffer[1:] yield to_yield for i in buffer: yield i if __name__ == '__main__': chain = MarkovChain(2) # first run through the mills.. stash = [] chain.update(remember_gen (simple_gen (sys.argv[1]), stash)) print "processing produced", len(chain.states.keys()), "states." while True: to_upchunk = chain.upchunk() if to_upchunk == None: break new_stash = [] new_chain = MarkovChain(chain.length) new_chain.update(remember_gen (upchunk_gen (stash, to_upchunk), new_stash)) stash = new_stash chain = new_chain print "final model formed and has", len(chain.states.keys()), "states."