Below is the file 'symbolstate.py' from this revision. You can also download the file.
import math


class MarkovException(Exception):
    """Raised when a context passed to MarkovModel.add has the wrong length."""
    pass


class MarkovScore:
    """Occurrence counts of the tokens recorded for one fixed context."""

    def __init__(self):
        self.scores = {}       # token -> number of times it was recorded
        self.total = 0         # sum of all counts held in self.scores
        self._entropy = None   # memoized entropy(); None means "recompute"

    def add_score(self, token):
        """Record one more occurrence of `token`."""
        self.scores[token] = self.scores.get(token, 0) + 1
        self.total += 1
        # The distribution changed, so any memoized entropy is now stale.
        self._entropy = None

    def entropy(self):
        """Return the Shannon entropy, in bits, of the recorded distribution.

        The value is memoized until the next add_score() call.  With no
        recorded tokens the result is 0.
        """
        if self._entropy is None:
            total = float(self.total)
            probabilities = [count / total for count in self.scores.values()]
            self._entropy = -sum(p * math.log(p, 2) for p in probabilities)
        return self._entropy


class MarkovModel:
    """Maps each fixed-length context (a tuple of tokens) to a MarkovScore."""

    def __init__(self, size):
        self.total = 0      # number of (context, token) observations added
        self.size = size    # required length of every context
        self.scores = {}    # context tuple -> MarkovScore

    def add(self, tokens, token):
        """Record that `token` was observed after the context `tokens`.

        `tokens` may be any iterable; it is converted to a tuple so it can be
        used as a dictionary key.

        Raises:
            MarkovException: if the context does not hold exactly
                `self.size` tokens.
        """
        tokens = tuple(tokens)
        if len(tokens) != self.size:
            raise MarkovException("Token list is of incorrect size.")
        # `in` works on both Python 2 and 3; dict.has_key() is Python 2 only.
        if tokens not in self.scores:
            self.scores[tokens] = MarkovScore()
        self.scores[tokens].add_score(token)
        self.total += 1


class SymbolState:
    """Builds a forward Markov model over a corpus of tokens."""

    def __init__(self, corpus, chain_size=2):
        self.forward_markov = MarkovModel(chain_size)
        self.corpus = corpus

    def update(self):
        """Feed every token of the corpus into the forward model.

        Tokens that are None are skipped entirely (they neither extend nor
        reset the context window).  Every call walks the whole corpus again
        and adds its observations on top of those already in the model.
        """
        size = self.forward_markov.size
        window = []
        for token in self.corpus:
            if token is None:
                continue
            if len(window) == size:
                self.forward_markov.add(window, token)
                window = window[1:]
            window.append(token)

    def chunkable(self, sd_multiplier=6):
        """Return the contexts whose successor entropy is unusually high.

        A context is selected when its entropy is strictly greater than
        ``mean + sd_multiplier * sd``, where mean and sd (population standard
        deviation) are taken over the entropies of all contexts in the model.

        Args:
            sd_multiplier: number of standard deviations above the mean used
                as the cutoff.  The default of 6 was chosen empirically.

        Returns:
            A list of context tuples (always a list, possibly empty).
        """
        scores = self.forward_markov.scores
        # Compute each entropy once and reuse it for both statistics and
        # the final selection.
        entropy_by_context = dict(
            (context, score.entropy()) for context, score in scores.items()
        )
        if not entropy_by_context:
            return []

        values = list(entropy_by_context.values())
        count = len(values)
        mean_h = sum(values) / count
        sd_h = math.sqrt(sum((h - mean_h) ** 2 for h in values) / count)
        cutoff = mean_h + sd_multiplier * sd_h
        # NOTE: a value can lie at most sqrt(count - 1) population standard
        # deviations above the mean, so with the default multiplier nothing
        # can be selected until the model holds at least 38 contexts.
        return [context for context in scores
                if entropy_by_context[context] > cutoff]