The unified diff between revisions [aca48eab..] and [6566c817..] is displayed below. It can also be downloaded as a raw diff.
#
#
# add_file "fritz.py"
# content [28c80476a119ac744f16107031bb10ed60e3e780]
#
# patch "mk2.py"
# from [5a6ddfdcab6cfff50ea025721abb21ddc65536a1]
# to [a7028cf9ec6b8a7ed3584edd7c2adc04e5ffbe51]
#
# set "fritz.py"
# attr "mtn:execute"
# value "true"
#
============================================================
--- fritz.py 28c80476a119ac744f16107031bb10ed60e3e780
+++ fritz.py 28c80476a119ac744f16107031bb10ed60e3e780
@@ -0,0 +1,53 @@
+#!/usr/bin/env python2.4
+
+from mk2 import MarkovChain, MarkovState
+import random
+random.seed()
+import cPickle
+import sys
+
+def debug_fritzable(chain):
+ def fritzable(s):
+ return len (s.state) > 1
+ print chain.states.values()
+ fritzable_states = filter (fritzable, chain.states.values())
+ print map (lambda s : ''.join (s.state), fritzable_states)
+
+def fritz(chain, max_bytes):
+ def starting_point():
+ # pick a starting point
+ total = sum (map (lambda s : s.total, chain.states.values ()))
+ choice = random.randint(0, total - 1)
+ state = None
+ for idx, state in enumerate (chain.states.values ()):
+ choice -= state.total
+ if choice <= 0:
+ break
+ return state
+ # walk the markov, output stuff
+ out_bytes = 0
+ state = None
+ # but let's have some cycle detection
+ cycle_counts = {}
+ while out_bytes < max_bytes:
+ if state == None:
+ state = starting_point()
+ if state == None:
+ # we're really out of ideas..
+ break
+ to_out = ''.join (state.state)
+ if out_bytes + len(to_out) >= max_bytes:
+ to_out = to_out[:max_bytes - out_bytes]
+ sys.stdout.write (to_out)
+ out_bytes += len(to_out)
+ cycle_count = cycle_counts[id(state)] = cycle_counts.setdefault(id(state), 0) + 1
+ if cycle_count > 5:
+ state = None
+ cycle_count = {}
+ else:
+ state = chain.random_next (state)
+ sys.stdout.write('\n')
+
+if __name__ == '__main__':
+ chain = cPickle.load (sys.stdin)
+ fritz(chain, 1024)
============================================================
--- mk2.py 5a6ddfdcab6cfff50ea025721abb21ddc65536a1
+++ mk2.py a7028cf9ec6b8a7ed3584edd7c2adc04e5ffbe51
@@ -1,6 +1,8 @@ import cPickle
#!/usr/bin/env python
import cPickle
+import random
+random.seed()
import heapq
import math
import sys
@@ -19,14 +21,15 @@ class MarkovState(object):
def __entropy(self):
return -1 * sum(map(lambda p: p * math.log(p, 2),
- map(lambda x: (self.scores[x] / float(self.total)), self.scores)))
+ map(lambda x: (self.scores[x] / float(self.total)), self.scores)))
+
def entropy(self):
if self.h == None:
self.h = self.__entropy()
return self.h
def __repr__(self):
- return repr(self.state)
+ return repr(self.scores)
def __cmp__(self, other):
if other == None:
@@ -35,12 +38,13 @@ class MarkovChain(object):
class MarkovChain(object):
def __init__(self, length):
- self.states = {}
self.length = length
+ self.clear()
def update(self, gen):
buffer = []
for token in gen:
+ self.stash.append(token)
if len(buffer) == self.length:
tbuffer = tuple(buffer)
if self.states.has_key(tbuffer):
@@ -50,23 +54,76 @@ class MarkovChain(object):
state.increment(token)
buffer = buffer[1:]
buffer.append(token)
+
+ def clear(self):
+ self.states = {}
+ self.stash = []
+
+ def random_next(self, from_state):
+ def next_state(token):
+ return from_state.state[:-1] + (token,)
+ # eliminate dead-ends
+ def not_dead_end(token):
+ return self.states.has_key (next_state (token))
+ possible = filter (not_dead_end, from_state.scores.keys())
+# print >>sys.stderr, (from_state, possible)
+ if not possible:
+ return None
+ total = sum (map (lambda s: from_state.scores[s], possible))
+ choice = random.randrange(0, total)
+ for k in possible:
+ total -= from_state.scores[k]
+ if total <= 0:
+ return self.states[next_state(k)]
+ raise Exception("Unreachable")
def upchunk(self):
+ while True:
+ to_upchunk = self.__select_upchunk()
+ if to_upchunk == None:
+ break
+ stash_copy = self.stash
+ self.clear()
+ self.update(self.__upchunk_gen (stash_copy, to_upchunk))
+ del stash_copy
+
+ def entropy_sort(self):
q = []
keys = self.states.keys()
if len(keys) == 0:
- return
+ return q
for idx, tokens in enumerate(keys):
state = self.states[tokens]
heapq.heappush(q, state)
+ return q
+
+ def __select_upchunk(self):
+ q = self.entropy_sort ()
cutoff = math.log (len (keys), 2) / 4
candidate = heapq.heappop(q)
- print "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff)
+ print >>sys.stderr, "best entropy vs. cutoff is: %s :: %.2f vs. cutoff %.2f" % (candidate.state, candidate.entropy(), cutoff)
if candidate.entropy() < cutoff:
return None
else:
return candidate.state
-
+
+ def __upchunk_gen(self, gen, to_upchunk):
+ buffer = []
+ for i in gen:
+ buffer.append(i)
+ if len(buffer) == len(to_upchunk):
+ if tuple(buffer) == to_upchunk:
+ buffer = [ ''.join(buffer) ]
+ else:
+ to_yield, buffer = buffer[0], buffer[1:]
+ yield to_yield
+ for i in buffer:
+ yield i
+
+ def pprint(self):
+ from pprint import pprint
+ pprint(chain.states)
+
def simple_gen(fname):
for line in open(fname, 'rb'):
for char in line:
@@ -74,41 +131,11 @@ def simple_gen(fname):
# for word in line.split():
# yield word.lower()
-def remember_gen(gen, remember):
- for i in gen:
- remember.append(i)
- yield i
-
-def upchunk_gen(gen, to_upchunk):
- buffer = []
- for i in gen:
- buffer.append(i)
- if len(buffer) == len(to_upchunk):
- if tuple(buffer) == to_upchunk:
- buffer = [ ''.join(buffer) ]
- else:
- to_yield, buffer = buffer[0], buffer[1:]
- yield to_yield
- for i in buffer:
- yield i
-
if __name__ == '__main__':
chain = MarkovChain(2)
+ for infile in sys.argv[1:]:
+ print >> sys.stderr, "Reading input file:", infile
+ chain.update(simple_gen (infile))
+ chain.upchunk()
+ print >>sys.stderr, "processing produced", len(chain.states.keys()), "states."
+ cPickle.dump(chain, sys.stdout, protocol=2)
- if len(sys.argv) <> 3:
- raise Exception("Usage: %s <file> <outfile.pk>" %sys.argv[0])
- infile, outfile = sys.argv[1:]
- # first run through the mills..
- stash = []
- chain.update(remember_gen (simple_gen (infile), stash))
- print "processing produced", len(chain.states.keys()), "states."
- while True:
- to_upchunk = chain.upchunk()
- if to_upchunk == None:
- break
- new_stash = []
- new_chain = MarkovChain(chain.length)
- new_chain.update(remember_gen (upchunk_gen (stash, to_upchunk), new_stash))
- stash = new_stash
- chain = new_chain
- print "final model formed and has", len(chain.states.keys()), "states."
- cPickle.dump(chain, open(outfile, 'w'), protocol=2)