The unified diff between revisions [47b93972..] and [5502da91..] is displayed below. It can also be downloaded as a raw diff.

#
#
# add_file "mk2.py"
#  content [347465bde5e18766879b3f3207985d278b8f711d]
#
#   set "mk2.py"
#  attr "mtn:execute"
# value "true"
#
============================================================
--- mk2.py	347465bde5e18766879b3f3207985d278b8f711d
+++ mk2.py	347465bde5e18766879b3f3207985d278b8f711d
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+import heapq
+import math
+import sys
+
+class MarkovState(object):
+    def __init__(self, state):
+        self.state = state
+        self.h = None
+        self.total = 0
+        self.scores = {}
+
+    def increment(self, token):
+        self.total += 1
+        self.scores[token] = self.scores.get(token, 0) + 1
+        self.h = None
+
+    def __entropy(self):
+        return -1 * sum(map(lambda p: p * math.log(p, 2),
+                        map(lambda x: (self.scores[x] / float(self.total)), self.scores)))
+    def entropy(self):
+        if self.h == None:
+            self.h = self.__entropy()
+        return self.h
+
+    def __cmp__(self, other):
+        if other == None:
+            return -1
+        return cmp(other.entropy(), self.entropy())
+
+class MarkovChain(object):
+    def __init__(self, length):
+        self.states = {}
+        self.length = length
+
+    def update(self, gen):
+        buffer = []
+        for token in gen:
+            if len(buffer) == self.length:
+                tbuffer = tuple(buffer)
+                if self.states.has_key(tbuffer):
+                    state = self.states[tbuffer]
+                else:
+                    state = self.states[tbuffer] = MarkovState(tbuffer)
+                state.increment(token)
+                buffer = buffer[1:]
+            buffer.append(token)
+
+    def upchunk(self):
+        q = []
+        keys = self.states.keys()
+        if len(keys) == 0:
+            return
+        for idx, tokens in enumerate(keys):
+            state = self.states[tokens]
+            heapq.heappush(q, state)
+        cutoff = math.log (len (keys), 2) / 4
+        candidate = heapq.heappop(q)
+        print "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff)
+        if candidate.entropy() < cutoff:
+            return None
+        else:
+            return candidate.state
+
+def simple_gen(fname):
+    for line in open(fname, 'rb'):
+        for char in line:
+            yield char
+#        for word in line.split():
+#            yield word.lower()
+
+def remember_gen(gen, remember):
+    for i in gen:
+        remember.append(i)
+        yield i
+
+def upchunk_gen(gen, to_upchunk):
+    buffer = []
+    for i in gen:
+        buffer.append(i)
+        if len(buffer) == len(to_upchunk):
+            if tuple(buffer) == to_upchunk:
+                buffer = [ ''.join(buffer) ]
+            else:
+                to_yield, buffer = buffer[0], buffer[1:]
+                yield to_yield
+    for i in buffer:
+        yield i
+
+if __name__ == '__main__':
+    chain = MarkovChain(2)
+    # first run through the mills..
+    stash = []
+    chain.update(remember_gen (simple_gen (sys.argv[1]), stash))
+    print "processing produced", len(chain.states.keys()), "states."
+    while True:
+        to_upchunk = chain.upchunk()
+        if to_upchunk == None:
+            break
+        new_stash = []
+        new_chain = MarkovChain(chain.length)
+        new_chain.update(remember_gen (upchunk_gen (stash, to_upchunk), new_stash))
+        stash = new_stash
+        chain = new_chain
+    print "final model formed and has", len(chain.states.keys()), "states."