The unified diff between revisions [aca48eab..] and [6566c817..] is displayed below. It can also be downloaded as a raw diff.

#
#
# add_file "fritz.py"
#  content [28c80476a119ac744f16107031bb10ed60e3e780]
#
# patch "mk2.py"
#  from [5a6ddfdcab6cfff50ea025721abb21ddc65536a1]
#    to [a7028cf9ec6b8a7ed3584edd7c2adc04e5ffbe51]
#
#   set "fritz.py"
#  attr "mtn:execute"
# value "true"
#
============================================================
--- fritz.py	28c80476a119ac744f16107031bb10ed60e3e780
+++ fritz.py	28c80476a119ac744f16107031bb10ed60e3e780
@@ -0,0 +1,53 @@
+#!/usr/bin/env python2.4
+
+from mk2 import MarkovChain, MarkovState
+import random
+random.seed()
+import cPickle
+import sys
+
+def debug_fritzable(chain):
+    def fritzable(s):
+        return len (s.state) > 1
+    print chain.states.values()
+    fritzable_states = filter (fritzable, chain.states.values())
+    print map (lambda s : ''.join (s.state),  fritzable_states)
+
+def fritz(chain, max_bytes):
+    def starting_point():
+        # pick a starting point
+        total = sum (map (lambda s : s.total, chain.states.values ()))
+        choice = random.randint(0, total - 1)
+        state = None
+        for idx, state in enumerate (chain.states.values ()):
+            choice -= state.total
+            if choice <= 0:
+                break
+        return state
+    # walk the markov, output stuff
+    out_bytes = 0
+    state = None
+    # but let's have some cycle detection
+    cycle_counts = {}
+    while out_bytes < max_bytes:
+        if state == None:
+            state = starting_point()
+        if state == None:
+            # we're really out of ideas..
+            break
+        to_out = ''.join (state.state)
+        if out_bytes + len(to_out) >= max_bytes:
+            to_out = to_out[:max_bytes - out_bytes]
+        sys.stdout.write (to_out)
+        out_bytes += len(to_out)
+        cycle_count = cycle_counts[id(state)] = cycle_counts.setdefault(id(state), 0) + 1
+        if cycle_count > 5:
+            state = None
+            cycle_count = {}
+        else:
+            state = chain.random_next (state)
+    sys.stdout.write('\n')
+
+if __name__ == '__main__':
+    chain = cPickle.load (sys.stdin)
+    fritz(chain, 1024)
============================================================
--- mk2.py	5a6ddfdcab6cfff50ea025721abb21ddc65536a1
+++ mk2.py	a7028cf9ec6b8a7ed3584edd7c2adc04e5ffbe51
@@ -1,6 +1,8 @@ import cPickle
 #!/usr/bin/env python

 import cPickle
+import random
+random.seed()
 import heapq
 import math
 import sys
@@ -19,14 +21,15 @@ class MarkovState(object):

     def __entropy(self):
         return -1 * sum(map(lambda p: p * math.log(p, 2),
-                        map(lambda x: (self.scores[x] / float(self.total)), self.scores)))
+                        map(lambda x: (self.scores[x] / float(self.total)), self.scores)))
+
     def entropy(self):
         if self.h == None:
             self.h = self.__entropy()
         return self.h

     def __repr__(self):
-        return repr(self.state)
+        return repr(self.scores)

     def __cmp__(self, other):
         if other == None:
@@ -35,12 +38,13 @@ class MarkovChain(object):

 class MarkovChain(object):
     def __init__(self, length):
-        self.states = {}
         self.length = length
+        self.clear()

     def update(self, gen):
         buffer = []
         for token in gen:
+            self.stash.append(token)
             if len(buffer) == self.length:
                 tbuffer = tuple(buffer)
                 if self.states.has_key(tbuffer):
@@ -50,23 +54,76 @@ class MarkovChain(object):
                 state.increment(token)
                 buffer = buffer[1:]
             buffer.append(token)
+
+    def clear(self):
+        self.states = {}
+        self.stash = []
+
+    def random_next(self, from_state):
+        def next_state(token):
+            return from_state.state[:-1] + (token,)
+        # eliminate dead-ends
+        def not_dead_end(token):
+            return self.states.has_key (next_state (token))
+        possible = filter (not_dead_end, from_state.scores.keys())
+#        print >>sys.stderr, (from_state, possible)
+        if not possible:
+            return None
+        total = sum (map (lambda s: from_state.scores[s], possible))
+        choice = random.randrange(0, total)
+        for k in possible:
+            total -= from_state.scores[k]
+            if total <= 0:
+                return self.states[next_state(k)]
+        raise Exception("Unreachable")

     def upchunk(self):
+        while True:
+            to_upchunk = self.__select_upchunk()
+            if to_upchunk == None:
+                break
+            stash_copy = self.stash
+            self.clear()
+            self.update(self.__upchunk_gen (stash_copy, to_upchunk))
+            del stash_copy
+
+    def entropy_sort(self):
         q = []
         keys = self.states.keys()
         if len(keys) == 0:
-            return
+            return q
         for idx, tokens in enumerate(keys):
             state = self.states[tokens]
             heapq.heappush(q, state)
+        return q
+
+    def __select_upchunk(self):
+        q = self.entropy_sort ()
         cutoff = math.log (len (keys), 2) / 4
         candidate = heapq.heappop(q)
-        print "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff)
+        print >>sys.stderr, "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff)
         if candidate.entropy() < cutoff:
             return None
         else:
             return candidate.state
-
+
+    def __upchunk_gen(self, gen, to_upchunk):
+        buffer = []
+        for i in gen:
+            buffer.append(i)
+            if len(buffer) == len(to_upchunk):
+                if tuple(buffer) == to_upchunk:
+                    buffer = [ ''.join(buffer) ]
+                else:
+                    to_yield, buffer = buffer[0], buffer[1:]
+                    yield to_yield
+        for i in buffer:
+            yield i
+
+    def pprint(self):
+        from pprint import pprint
+        pprint(chain.states)
+
 def simple_gen(fname):
     for line in open(fname, 'rb'):
         for char in line:
@@ -74,41 +131,11 @@ def simple_gen(fname):
 #        for word in line.split():
 #            yield word.lower()

-def remember_gen(gen, remember):
-    for i in gen:
-        remember.append(i)
-        yield i
-
-def upchunk_gen(gen, to_upchunk):
-    buffer = []
-    for i in gen:
-        buffer.append(i)
-        if len(buffer) == len(to_upchunk):
-            if tuple(buffer) == to_upchunk:
-                buffer = [ ''.join(buffer) ]
-            else:
-                to_yield, buffer = buffer[0], buffer[1:]
-                yield to_yield
-    for i in buffer:
-        yield i
-
 if __name__ == '__main__':
     chain = MarkovChain(2)
+    for infile in sys.argv[1:]:
+        print >> sys.stderr, "Reading input file:", infile
+        chain.update(simple_gen (infile))
+    chain.upchunk()
+    print >>sys.stderr, "processing produced", len(chain.states.keys()), "states."
+    cPickle.dump(chain, sys.stdout, protocol=2)
-    if len(sys.argv) <> 3:
-        raise Exception("Usage: %s <file> <outfile.pk>" %sys.argv[0])
-    infile, outfile = sys.argv[1:]
-    # first run through the mills..
-    stash = []
-    chain.update(remember_gen (simple_gen (infile), stash))
-    print "processing produced", len(chain.states.keys()), "states."
-    while True:
-        to_upchunk = chain.upchunk()
-        if to_upchunk == None:
-            break
-        new_stash = []
-        new_chain = MarkovChain(chain.length)
-        new_chain.update(remember_gen (upchunk_gen (stash, to_upchunk), new_stash))
-        stash = new_stash
-        chain = new_chain
-    print "final model formed and has", len(chain.states.keys()), "states."
-    cPickle.dump(chain, open(outfile, 'w'), protocol=2)