The unified diff between revisions [66f54f8e..] and [b74d6135..] is displayed below. It can also be downloaded as a raw diff.
#
#
# add_file "config.py"
# content [3c9e18ecf6c0b0698927b01c8a8a22a92d9a096a]
#
# patch "poet.py"
# from [39afd2783b013423441dbb1a506a451aa8b5a15f]
# to [dc2354ba987aa742b4918906d8913dbb89ffdfb3]
#
============================================================
--- config.py 3c9e18ecf6c0b0698927b01c8a8a22a92d9a096a
+++ config.py 3c9e18ecf6c0b0698927b01c8a8a22a92d9a096a
@@ -0,0 +1,6 @@
+
+import os
+
+install_path = '/Users/grahame/monotone/memes'
+storage_path = os.path.join(install_path, 'storage')
+word_shelf = os.path.join(install_path, 'word.shelf')
============================================================
--- poet.py 39afd2783b013423441dbb1a506a451aa8b5a15f
+++ poet.py dc2354ba987aa742b4918906d8913dbb89ffdfb3
@@ -3,26 +3,60 @@ from dictionary import Word
import generators
from generators import RSSGenerator, HTTPGenerator
from dictionary import Word
+
+import datetime
+import urlparse
+import cPickle
+import profile
+import config
import shelve
+import string
import math
+import gzip
+import sha
+import os
-class WordFactory:
+class SymbolLibrary:
+ """Store symbols and issue them with an ID, which can be used in place of the word."""
def __init__(self):
- self.db = shelve.open('word.shelf')
- self.cache = {}
+ self.word_info_db = None
+ # the index into the following array is the ID
+ self.words = []
+ self.next_id = 0
+ # allows you to go from a symbol to the ID which has the
+ # corresponding Word instance.
+ self.symbol_to_id = {}
+ def __require_db(self):
+ if self.word_info_db == None:
+ self.word_info_db = shelve.open(config.word_shelf)
+ def __close_db(self):
+ if self.word_info_db != None:
+ self.word_info_db.close()
+ self.word_info_db = None
def __lookup_word(self, w):
- if self.db.has_key(w):
- return self.db[w]
- else:
- return Word(w)
- def get(self, w):
- return self.cache.setdefault(w, self.__lookup_word(w))
- def generate(self, g):
+ # assumes that this is a word not yet seen
+ def word_instance():
+ if self.word_info_db.has_key(w):
+ return self.word_info_db[w]
+ else:
+ return Word(w)
+ w = word_instance()
+ self.words.append(w)
+ rv = self.next_id
+ self.next_id += 1
+ return rv
+ def determine_word_from_id(self, id):
+ return self.words[id]
+ def determine_id_for_word(self, word):
+ return self.symbol_to_id.setdefault(word, self.__lookup_word(word))
+ def words_to_id_stream(self, g):
+ self.__require_db()
for s in g:
r = []
for w in s:
- r.append(self.get(w))
+ r.append(self.determine_id_for_word(w))
yield r
+ self.__close_db()
class MarkovScore:
def __init__(self):
@@ -37,11 +71,13 @@ class MarkovModel:
class MarkovModel:
def __init__(self, size):
+ self.total = 0
self.size = size
self.scores = {}
def add(self, tokens, word):
if len(tokens) != self.size:
raise Exception("token list is of incorrect size.")
+ self.total += 1
self.scores.setdefault(tokens, MarkovScore()).add_score(word)
class SymbolState:
@@ -65,28 +101,85 @@ class SymbolState:
buffer = buffer[1:]
buffer.append(word)
def chunk(self, minimum_entropy):
+ max_h = 0
+ rv = None
for token_list in self.forward_markov.scores:
score = self.forward_markov.scores[token_list]
- print score.entries
- print map(lambda x: (score.entries[x] / score.total), score.entries)
- h = -1 * sum(map(lambda p: p * math.log(p, 2),
- map(lambda x: (score.entries[x] / score.total), score.entries)))
- print h
+ total = self.forward_markov.total
+# print token_list, "entries:", score.entries
+ h = -1 * sum(map(lambda p: p * math.log(p, 2),
+ map(lambda x: (score.entries[x] / float(score.total)), score.entries)))
+ if h > max_h:
+ max_h = h
+ rv = token_list
+ return rv
class Document:
def __init__(self):
self.sentences = []
+ self.symbols = SymbolLibrary()
self.update_state()
def add(self, sentences):
- self.sentences += map(lambda x: x, sentences)
+ self.sentences += map(lambda x: x, self.symbols.words_to_id_stream(sentences))
self.update_state()
def update_state(self):
self.symbol_state = SymbolState()
self.symbol_state.update(self.sentences)
- self.symbol_state.chunk(minimum_entropy=0.8)
+class Poet:
+ def __init__(self, document, poet):
+ self.document, self.poet = document, poet
+
+ def haiku(self):
+ to_chunk = document.symbol_state.chunk(minimum_entropy=0.8)
+
+def storage_dir(uri):
+ # list of directories in the storage path to get to this URI
+ dirs = []
+ # just an indication, obviously prone to abuse if someone feeds in a malicious hostname
+ site = filter(lambda x: x in string.letters or x in string.digits or x == '.', urlparse.urlparse(uri)[1])
+ # no ..
+ site = site.lstrip('.')
+ dirs.append(site)
+
+ # but this should be safe
+ hash = sha.new(uri).hexdigest()
+ dirs += [hash[:8], hash[8:16], hash[16:24], hash[24:32], hash[32:40]]
+
+ c_dir = config.storage_path
+ for dir in dirs:
+ c_dir = os.path.join(c_dir, dir)
+ if not os.access(c_dir, os.R_OK):
+ os.mkdir(c_dir)
+
+ uri_file = os.path.join(c_dir, 'uri')
+ if not os.access(uri_file, os.R_OK):
+ open(uri_file, 'w').write(uri + '\n')
+ else:
+ if open(uri_file).read() != uri + '\n':
+ raise Exception("Storage problem: the wrong URI is stored in this directory!")
+ return c_dir
+
+def document_for_uri(uri):
+ dir = storage_dir(uri)
+ doc_file = os.path.join(dir, 'doc.pickle')
+ if not os.access(doc_file, os.R_OK):
+ print "not from cache"
+ doc = Document()
+ else:
+ print "from cache"
+ doc = cPickle.load(open(doc_file))
+
+ if not hasattr(doc, 'last_update') or datetime.datetime.utcnow() - doc.last_update > datetime.timedelta(days=1):
+ # FIXME: cache which articles we've seen
+ doc.add(RSSGenerator(
+ HTTPGenerator('localhost', 'http://glamdring.local/~grahame/rss')))
+ doc.last_update = datetime.datetime.utcnow()
+ cPickle.dump(doc, open(doc_file, 'w'))
+
if __name__ == '__main__':
- factory = WordFactory()
- doc = Document()
- doc.add(factory.generate(RSSGenerator(HTTPGenerator('localhost', 'http://glamdring.local/~grahame/rss'))))
+ uri = 'http://glamdring.local/~grahame/rss'
+ doc = document_for_uri(uri)
+
+