Below is the file 'symbolstate.py' from this revision. You can also download the file.


import math

class MarkovException(Exception):
    """Error type for problems raised by the Markov model classes in this module."""

class MarkovScore:
    """Frequency table of observed tokens, with a cached entropy.

    Attributes:
        scores: dict mapping each token to the number of times it was added.
        total: total number of observations (the sum of all counts).
        h: cached result of entropy(); None while no valid value is cached.
    """

    def __init__(self):
        self.scores = {}
        self.total = 0
        # No entropy has been computed yet.
        self.h = None

    def add_score(self, token):
        """Record one more observation of ``token``."""
        self.scores[token] = self.scores.get(token, 0) + 1
        self.total += 1
        # The distribution just changed, so a previously cached entropy
        # would be stale; force entropy() to recompute on its next call.
        self.h = None

    def entropy(self):
        """Return the Shannon entropy, in bits, of the token distribution.

        Each token's probability is its count divided by ``total``. The
        value is cached in ``self.h`` until the next add_score() call.
        With no observations the sum is empty and the result is 0.
        """
        # getattr (rather than plain attribute access) keeps this working
        # for instances that were created without running __init__.
        if getattr(self, 'h', None) is None:
            total = float(self.total)
            probabilities = (count / total for count in self.scores.values())
            self.h = -1 * sum(p * math.log(p, 2) for p in probabilities)
        return self.h

class MarkovModel:
    """Fixed-order Markov model mapping token contexts to follower counts.

    Attributes:
        size: number of tokens every context must contain.
        scores: dict mapping a context (tuple of ``size`` tokens) to the
            MarkovScore that counts the tokens seen after that context.
        total: number of (context, token) observations added so far.
    """

    def __init__(self, size):
        self.total = 0
        self.size = size
        self.scores = {}

    def add(self, tokens, token):
        """Record that ``token`` followed the context ``tokens``.

        Args:
            tokens: iterable of exactly ``self.size`` tokens; it is
                converted to a tuple so it can be used as a dict key.
            token: the token observed after that context.

        Raises:
            MarkovException: if ``tokens`` does not contain exactly
                ``self.size`` items.
        """
        tokens = tuple(tokens)
        if len(tokens) != self.size:
            raise MarkovException("Token list is of incorrect size.")
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if tokens not in self.scores:
            self.scores[tokens] = MarkovScore()
        self.scores[tokens].add_score(token)
        self.total += 1


class SymbolState:
    """Builds a forward Markov model over a corpus and finds unusual contexts.

    Attributes:
        forward_markov: MarkovModel whose contexts are ``chain_size`` tokens.
        corpus: iterable of tokens; None entries are ignored by update().
    """

    def __init__(self, corpus, chain_size=2):
        self.forward_markov = MarkovModel(chain_size)
        self.corpus = corpus

    def update(self):
        """Feed the corpus into the model one sliding window at a time.

        Every non-None token is recorded as the follower of the
        ``forward_markov.size`` non-None tokens that precede it. Calling
        this again adds the same observations to the model a second time.
        """
        size = self.forward_markov.size
        window = []
        for token in self.corpus:
            # Identity test: `== None` would defer to the token's own __eq__.
            if token is None:
                continue
            if len(window) == size:
                self.forward_markov.add(window, token)
                # Drop the oldest token so the window keeps its length
                # once the current token is appended below.
                window = window[1:]
            window.append(token)

    def chunkable(self, sd_multiplier=6):
        """Return the contexts whose follower entropy is unusually high.

        A context qualifies when its entropy is strictly greater than
        ``mean + sd_multiplier * sd``, where mean and sd are the mean and
        population standard deviation of the entropies of all contexts.

        Args:
            sd_multiplier: number of standard deviations above the mean
                used as the cutoff (default 6).

        Returns:
            A list of context tuples; empty when the model has no contexts.
        """
        scores = self.forward_markov.scores
        entropy_by_context = [(context, scores[context].entropy())
                              for context in scores]
        if not entropy_by_context:
            # Nothing recorded yet: there is no mean to compare against,
            # and dividing by the zero count would raise ZeroDivisionError.
            return []
        count = float(len(entropy_by_context))
        mean_h = sum(h for _, h in entropy_by_context) / count
        sd_h = math.sqrt(
            sum((h - mean_h) ** 2 for _, h in entropy_by_context) / count)
        # should really justify in some way other than 'it works'
        cutoff = mean_h + sd_multiplier * sd_h
        # Build a real list so callers get the same type on Python 2 and 3.
        return [context for context, h in entropy_by_context if h > cutoff]