The unified diff between revisions [66f54f8e..] and [b74d6135..] is displayed below. It can also be downloaded as a raw diff.

#
#
# add_file "config.py"
#  content [3c9e18ecf6c0b0698927b01c8a8a22a92d9a096a]
#
# patch "poet.py"
#  from [39afd2783b013423441dbb1a506a451aa8b5a15f]
#    to [dc2354ba987aa742b4918906d8913dbb89ffdfb3]
#
============================================================
--- config.py	3c9e18ecf6c0b0698927b01c8a8a22a92d9a096a
+++ config.py	3c9e18ecf6c0b0698927b01c8a8a22a92d9a096a
@@ -0,0 +1,6 @@
+
+import os
+
+install_path = '/Users/grahame/monotone/memes'
+storage_path = os.path.join(install_path, 'storage')
+word_shelf = os.path.join(install_path, 'word.shelf')
============================================================
--- poet.py	39afd2783b013423441dbb1a506a451aa8b5a15f
+++ poet.py	dc2354ba987aa742b4918906d8913dbb89ffdfb3
@@ -3,26 +3,60 @@ from dictionary import Word
 import generators
 from generators import RSSGenerator, HTTPGenerator
 from dictionary import Word
+
+import datetime
+import urlparse
+import cPickle
+import profile
+import config
 import shelve
+import string
 import math
+import gzip
+import sha
+import os

-class WordFactory:
+class SymbolLibrary:
+    """Store symbols and issue them with an ID, which can be used in place of the word."""
     def __init__(self):
-	self.db = shelve.open('word.shelf')
-	self.cache = {}
+	self.word_info_db = None
+	# the index into the following array is the ID
+	self.words = []
+	self.next_id = 0
+	# allows you to go from a symbol to the ID which has the
+	# corresponding Word instance.
+	self.symbol_to_id = {}
+    def __require_db(self):
+	if self.word_info_db is None:
+	    self.word_info_db = shelve.open(config.word_shelf)
+    def __close_db(self):
+	if self.word_info_db is not None:
+	    self.word_info_db.close()
+	    self.word_info_db = None
     def __lookup_word(self, w):
-	if self.db.has_key(w):
-	    return self.db[w]
-	else:
-	    return Word(w)
-    def get(self, w):
-	return self.cache.setdefault(w, self.__lookup_word(w))
-    def generate(self, g):
+	# registers w under a fresh ID; callers must only pass unseen symbols
+	if self.word_info_db.has_key(w):
+	    word = self.word_info_db[w]
+	else:
+	    word = Word(w)
+	self.words.append(word)
+	self.next_id += 1
+	return self.next_id - 1
+    def determine_word_from_id(self, id):
+	return self.words[id]
+    def determine_id_for_word(self, word):
+	# setdefault() would evaluate __lookup_word() even for known symbols
+	if not self.symbol_to_id.has_key(word):
+	    self.symbol_to_id[word] = self.__lookup_word(word)
+	return self.symbol_to_id[word]
+    def words_to_id_stream(self, g):
+	self.__require_db()
 	for s in g:
 	    r = []
 	    for w in s:
-		r.append(self.get(w))
+		r.append(self.determine_id_for_word(w))
 	    yield r
+	self.__close_db()

 class MarkovScore:
     def __init__(self):
@@ -37,11 +71,13 @@ class MarkovModel:

 class MarkovModel:
     def __init__(self, size):
+	self.total = 0
 	self.size = size
 	self.scores = {}
     def add(self, tokens, word):
 	if len(tokens) != self.size:
 	    raise Exception("token list is of incorrect size.")
+	self.total += 1
 	self.scores.setdefault(tokens, MarkovScore()).add_score(word)

 class SymbolState:
@@ -65,28 +101,85 @@ class SymbolState:
 		    buffer = buffer[1:]
 		buffer.append(word)
     def chunk(self, minimum_entropy):
+	max_h = 0
+	rv = None
 	for token_list in self.forward_markov.scores:
 	    score = self.forward_markov.scores[token_list]
-	    print score.entries
-	    print map(lambda x: (score.entries[x] / score.total), score.entries)
-	    h = -1 * sum(map(lambda p: p * math.log(p, 2),
-			     map(lambda x: (score.entries[x] / score.total), score.entries)))
-	    print h
+	    total = self.forward_markov.total
+#	    print token_list, "entries:", score.entries
+	    h = -1 *  sum(map(lambda p: p * math.log(p, 2),
+			      map(lambda x: (score.entries[x] / float(score.total)), score.entries)))
+	    if h > max_h:
+		max_h = h
+		rv = token_list
+	return rv

 class Document:
     def __init__(self):
 	self.sentences = []
+	self.symbols = SymbolLibrary()
 	self.update_state()
     def add(self, sentences):
-	self.sentences += map(lambda x: x, sentences)
+	self.sentences += map(lambda x: x, self.symbols.words_to_id_stream(sentences))
 	self.update_state()
     def update_state(self):
 	self.symbol_state = SymbolState()
 	self.symbol_state.update(self.sentences)
-	self.symbol_state.chunk(minimum_entropy=0.8)

+class Poet:
+    """Holds the Document a poem is to be drawn from; haiku() is still a stub."""
+    def __init__(self, document, poet):
+	self.document, self.poet = document, poet
+    def haiku(self):
+	to_chunk = self.document.symbol_state.chunk(minimum_entropy=0.8)
+
+def storage_dir(uri):
+    # list of directories in the storage path to get to this URI
+    dirs = []
+    # just an indication, obviously prone to abuse if someone feeds in a malicious hostname
+    site = filter(lambda x: x in string.letters or x in string.digits or x == '.', urlparse.urlparse(uri)[1])
+    # no ..
+    site = site.lstrip('.')
+    dirs.append(site)
+
+    # but this should be safe
+    hash = sha.new(uri).hexdigest()
+    dirs += [hash[:8], hash[8:16], hash[16:24], hash[24:32], hash[32:40]]
+
+    c_dir = config.storage_path
+    for dir in dirs:
+	c_dir = os.path.join(c_dir, dir)
+	if not os.access(c_dir, os.R_OK):
+	    os.mkdir(c_dir)
+
+    uri_file = os.path.join(c_dir, 'uri')
+    if not os.access(uri_file, os.R_OK):
+	open(uri_file, 'w').write(uri + '\n')
+    else:
+	if open(uri_file).read() != uri + '\n':
+	    raise Exception("Storage problem: the wrong URI is stored in this directory!")
+    return c_dir
+
+def document_for_uri(uri):
+    """Return the Document cached for uri, refreshing it when over a day old."""
+    dir = storage_dir(uri)
+    doc_file = os.path.join(dir, 'doc.pickle')
+    if not os.access(doc_file, os.R_OK):
+	print "not from cache"
+	doc = Document()
+    else:
+	print "from cache"
+	doc = cPickle.load(open(doc_file))
+    if not hasattr(doc, 'last_update') or datetime.datetime.utcnow() - doc.last_update > datetime.timedelta(days=1):
+	# FIXME: cache which articles we've seen
+	doc.add(RSSGenerator(HTTPGenerator('localhost', 'http://glamdring.local/~grahame/rss')))
+	doc.last_update = datetime.datetime.utcnow()
+	cPickle.dump(doc, open(doc_file, 'w'))
+    return doc
+
 if __name__ == '__main__':
-    factory = WordFactory()
-    doc = Document()
-    doc.add(factory.generate(RSSGenerator(HTTPGenerator('localhost', 'http://glamdring.local/~grahame/rss'))))
+    uri = 'http://glamdring.local/~grahame/rss'
+    doc = document_for_uri(uri)

+
+