The unified diff between revisions [092dff1e..] and [0eaf66b5..] is displayed below. It can also be downloaded as a raw diff.

#
#
# add_file "forms.py"
#  content [b631b2a653d8c57334c492d8e87cd9296ac07fa4]
#
# add_file "poet3.py"
#  content [3c5adc98809bbd5a6df624911190c02c311f2057]
#
# add_file "storage.py"
#  content [bff5648f025b275e05135246f0c56236953eaef4]
#
# add_file "syllables.py"
#  content [1ddbe0eb3425f3b5cc513e8af525d583ac771aff]
#
# add_file "symbolstate.py"
#  content [83408e4998ed66268e39b916225c46e89b976cdb]
#
# add_file "test.py"
#  content [65bdbf5e10acc674375486e7889981090bd9255b]
#
# patch "config.py"
#  from [2f4bcd09716fcbb8fed88991cb35d2c009e34cfb]
#    to [1028ab910fddc15e627d9e16fcd47d75bd0954ec]
#
# patch "generators.py"
#  from [1f4da17464aa9497077d16c27ccb84e60453057d]
#    to [550b478b5c306ae773cd42e8e5aff775cb0b8aed]
#
#   set "poet3.py"
#  attr "mtn:execute"
# value "true"
#
============================================================
--- forms.py	b631b2a653d8c57334c492d8e87cd9296ac07fa4
+++ forms.py	b631b2a653d8c57334c492d8e87cd9296ac07fa4
@@ -0,0 +1,107 @@
+
+import random
+import config
+
+def haiku(doc, form=(5, 7, 5)):
+    """Generate a poem with one line per entry of form.
+
+    doc must provide symbol_state.forward_markov (the Markov model) and
+    syllables.lookup (token -> syllable count).  form lists the syllable
+    target of each line; every line after the first continues from the
+    tokens of the line before it.
+
+    Returns a list of token lists, or None if some line could not be
+    generated within config.haiku_line_attempts tries.
+    """
+    markov = doc.symbol_state.forward_markov
+    syl = doc.syllables.lookup
+
+    def pickfrom(possible, total, get_total):
+        """Pick an item of possible at random, weighted by get_total(item).
+
+        total must be the sum of the weights of everything in possible.
+        """
+        k = random.randint(0, total - 1)
+        for seq in possible:
+            k -= get_total(seq)
+            if k < 0:
+                return seq
+        raise Exception("Failed to pick a number - 'total' miscalcuation? (%d, %d)" % (k, total))
+
+    def generate_line(target, state):
+        """Return a list of tokens with exactly target syllables, or None.
+
+        state is the previous line's tokens (empty for the first line);
+        its last markov.size entries are the context to continue from.
+        """
+        state = tuple(state)
+        line = []
+        count = 0
+
+        if len(state) < markov.size:
+            # Not enough context, so we'll have to pick a starting point:
+            # a whole random state, weighted by how often it was seen.
+            # Only considering states that fit the syllable target would
+            # be more accurate, but is way too slow.
+            if not markov.scores:
+                # an empty model has nothing to start from
+                return None
+            seq = pickfrom(markov.scores, markov.total, lambda seq: markov.scores[seq].total)
+            count += sum(map(syl, seq))
+            line += list(seq)
+            state = seq
+        elif len(state) > markov.size:
+            state = state[-1*markov.size:]
+
+        while count < target:
+            maxsize = target - count
+            # The previous line may have ended on a state that nothing
+            # ever followed; give up on this attempt instead of raising
+            # KeyError.
+            score = markov.scores.get(state)
+            if score is None:
+                break
+
+            # okay, if count + syl(next_token) != target then we need there
+            # to be an entry in the symbolstate for that next potential
+            # symbol. this lets us restrict further and not fall down holes
+            # so often
+            def is_not_deadend(tok):
+                if count + syl(tok) == target:
+                    return True
+                return (state + (tok,))[1:] in markov.scores
+
+            possible = set([tok for tok in score.scores
+                            if syl(tok) <= maxsize and is_not_deadend(tok)])
+            print "status:", target, state, score.scores.keys(), possible
+            if not possible:
+                break
+
+            total_possible = sum([score.scores[tok] for tok in possible])
+            token = pickfrom(possible, total_possible, lambda seq: score.scores[seq])
+            count += syl(token)
+            state = (state + (token,))[1:]
+            line.append(token)
+
+        if count != target:
+            return None
+        return line
+
+    rv = []
+    last_line = []
+    for length in form:
+        print "** target length is:", length
+
+        # stays None when config.haiku_line_attempts is zero
+        line = None
+        for i in xrange(config.haiku_line_attempts):
+            line = generate_line(length, last_line)
+            if line is not None:
+                break
+
+        print "** resulting line is:", line
+        if not line:
+            return None
+        rv.append(line)
+        last_line = line
+    return rv
============================================================
--- poet3.py	3c5adc98809bbd5a6df624911190c02c311f2057
+++ poet3.py	3c5adc98809bbd5a6df624911190c02c311f2057
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+
+#
+# take poet.py and make it fast / sensible,
+# ready for deployment on angrygoats.net
+#
+
+from forms import haiku
+from generators import RSSGenerator, HTTPGenerator
+from storage import Storage
+from syllables import Syllables
+from symbolstate import SymbolState
+import os
+import config
+
+class PoetException(Exception):
+    """Raised by gen_haiku when no poem could be generated."""
+
+class Poet3:
+    """The stored corpus of one feed URI and the Markov state built from it."""
+
+    def __init__(self, uri):
+        self.uri = uri
+        self.storage = Storage(self.uri)
+        self.syllables = Syllables()
+        # the marker file is created by upwrite() once it has run
+        self.upwritten = self.storage.has_file('has_been_upwritten.txt')
+
+    def update(self):
+        """Fetch the feed and append the tokens it yields to corpus.txt.
+
+        articles.txt holds the article ids already known; the new ids
+        collected by RSSGenerator are appended to it afterwards.
+        """
+        self.storage.require_files(('articles.txt', 'corpus.txt'))
+
+        articles_fd = self.storage.open('articles.txt')
+        articles = set([t.strip() for t in articles_fd])
+        articles_fd.close()
+        new_articles = set()
+
+        corpus_fd = self.storage.open('corpus.txt', 'a')
+        for token_seq in RSSGenerator(HTTPGenerator(self.uri), articles, new_articles):
+            for token in token_seq:
+                corpus_fd.write(token+"\n")
+        corpus_fd.close()
+
+        articles_fd = self.storage.open('articles.txt', 'a')
+        for article in new_articles:
+            articles_fd.write(article+"\n")
+        articles_fd.close()
+
+    def build_state(self):
+        """Build self.symbol_state from the tokens stored in corpus.txt."""
+        corpus_fd = self.storage.open('corpus.txt')
+        self.symbol_state = SymbolState((t.strip() for t in corpus_fd))
+        self.symbol_state.update()
+        corpus_fd.close()
+
+    def upwrite(self, to_upwrite):
+        """Rewrite corpus.txt with every occurrence of a sequence from
+        to_upwrite joined into a single space-separated token, then create
+        has_been_upwritten.txt.
+
+        to_upwrite holds token tuples of the Markov chain size, as returned
+        by SymbolState.chunkable(); build_state() must have been called
+        first.  Does nothing when to_upwrite is empty.
+        """
+        if len(to_upwrite) == 0:
+            return
+
+        size = self.symbol_state.forward_markov.size
+        corpus_fd = self.storage.open('corpus.txt')
+        upwrite_fd = self.storage.open('upwrite.txt', 'w')
+
+        def get_token_lists():
+            # matches[i] is how many leading tokens of to_upwrite[i] the
+            # most recent input has matched; pending holds the input tokens
+            # that may still turn out to be part of a match.
+            matches = [0] * len(to_upwrite)
+            pending = []
+            for token in (t.strip() for t in corpus_fd):
+                pending.append(token)
+                for i, seq in enumerate(to_upwrite):
+                    if token == seq[matches[i]]:
+                        matches[i] += 1
+                    else:
+                        # the mismatching token may itself start a match
+                        matches[i] = int(token == seq[0])
+
+                    if matches[i] == size:
+                        yield [' '.join(seq)]
+                        matches = [0] * len(to_upwrite)
+                        pending = []
+                        break
+
+                longest_match = max(matches)
+                if len(pending) > longest_match:
+                    output_idx = len(pending)-longest_match
+                    output, pending = pending[:output_idx], pending[output_idx:]
+                    yield output
+
+            # a match left unfinished at the end of the corpus is plain text
+            if pending:
+                yield pending
+
+        for token_list in get_token_lists():
+            for token in token_list:
+                upwrite_fd.write(token+'\n')
+        upwrite_fd.close()
+        corpus_fd.close()
+        os.rename(self.storage.file('upwrite.txt'),
+                  self.storage.file('corpus.txt'))
+
+        self.storage.open('has_been_upwritten.txt', 'w').close()
+        self.upwritten = True
+
+def gen_haiku(uri):
+    """Update the corpus stored for uri and generate a haiku from it.
+
+    Returns (attempt, poem): the zero-based index of the attempt that
+    succeeded and the poem as a list of token lists.  Raises PoetException
+    if config.haiku_attempts attempts all fail.
+    """
+    poet = Poet3(uri)
+    poet.update()
+    poet.build_state()
+    if not poet.upwritten:
+        # NOTE(review): the model was built before this rewrite of the
+        # corpus and is not rebuilt afterwards - confirm that is intended.
+        to_upwrite = poet.symbol_state.chunkable()
+        poet.upwrite(to_upwrite)
+
+    poem = None
+    for attempt in xrange(config.haiku_attempts):
+        poem = haiku(poet)
+        if poem is not None:
+            break
+
+    if poem is None:
+        raise PoetException("Could not generate you a poem!")
+
+    return attempt, poem
+
+if __name__ == '__main__':
+    uri = 'http://glamdring.local/~grahame/rss'
+    attempts, poem = gen_haiku(uri)
+    print "%d attempts" % attempts
+    for line in poem:
+        print ' '.join(line)
============================================================
--- storage.py	bff5648f025b275e05135246f0c56236953eaef4
+++ storage.py	bff5648f025b275e05135246f0c56236953eaef4
@@ -0,0 +1,63 @@
+
+import urlparse
+import string
+import sha
+import os
+
+import config
+
+class StorageException(Exception):
+    """Raised by Storage when a file name is not acceptable."""
+
+class Storage:
+    """A per-URI directory below config.storage_path and the files in it."""
+
+    def __init__(self, uri):
+        self.uri = uri
+        self.dir = self.__storage_dir()
+
+    def __storage_dir(self):
+        """Return the storage directory for self.uri, creating it if needed."""
+        # The first component is the URI's host part reduced to letters,
+        # digits and dots; leading dots are stripped so that it can never
+        # be a '.' or '..' entry that climbs the filesystem.
+        netloc = urlparse.urlparse(self.uri)[1]
+        allowed = string.letters + string.digits + '.'
+        site = ''.join([c for c in netloc if c in allowed]).lstrip('.')
+
+        # The rest is the SHA-1 hex digest of the whole URI in five
+        # eight-character pieces, which is always safe as a path.
+        digest = sha.new(self.uri).hexdigest()
+        dirs = [site] + [digest[i:i+8] for i in range(0, 40, 8)]
+
+        c_dir = os.path.join(config.storage_path, *dirs)
+        if not os.path.isdir(c_dir):
+            # makedirs also creates config.storage_path itself when missing
+            os.makedirs(c_dir)
+        return c_dir
+
+    def file(self, fname):
+        """Return the path of fname inside the storage directory.
+
+        Raises StorageException if fname starts with a slash.
+        """
+        if fname.startswith('/'):
+            raise StorageException("fname may not start with a slash.")
+        return os.path.join(self.dir, fname)
+
+    def has_file(self, fname):
+        """Return True if fname exists in the storage directory and is readable."""
+        return os.access(self.file(fname), os.R_OK)
+
+    def require_files(self, files):
+        """Create an empty file for every name in files that is not readable."""
+        for fname in files:
+            if not self.has_file(fname):
+                # close at once rather than leaking the handle
+                open(self.file(fname), 'w').close()
+
+    def open(self, *args):
+        """Open a stored file: like the builtin open(), except that the
+        first argument is a name relative to the storage directory."""
+        fname, other_args = args[0], args[1:]
+        return open(self.file(fname), *other_args)
============================================================
--- syllables.py	1ddbe0eb3425f3b5cc513e8af525d583ac771aff
+++ syllables.py	1ddbe0eb3425f3b5cc513e8af525d583ac771aff
@@ -0,0 +1,42 @@
+
+class Syllables:
+    """Caching syllable counter for word tokens."""
+
+    def __init__(self):
+        # token -> syllable estimate, filled in by lookup()
+        self.cache = {}
+
+    def lookup(self, token):
+        """Return the syllable estimate for token, computing it at most once."""
+        if token not in self.cache:
+            self.cache[token] = self.__syllable_estimate(token)
+        return self.cache[token]
+
+    def __syllable_estimate(self, token):
+        """Last resort syllable counter. Reasonably accurate in English.
+        Allegedly works for French.
+
+        Counts the runs of vowels in token, ignoring letter case, and takes
+        one off for a final 'e' (unless it follows an 'l') or a final 'es'.
+        Returns 0 for an empty token and 1 for any other token of up to
+        three characters.
+        """
+        if len(token) == 0:
+            return 0
+        if len(token) <= 3:
+            return 1
+
+        # lower-case first so that capitals are recognised as vowels too
+        word = token.lower()
+        vowels = ('a', 'e', 'i', 'o', 'u', 'y', "'")
+        previous = None
+        count = 0
+        for c in word:
+            # only the first vowel of a run starts a new syllable
+            if c in vowels and previous not in vowels:
+                count += 1
+            previous = c
+        if count > 1 and ((word[-1] == 'e' and word[-2] != 'l') or
+                          (word[-2] == 'e' and word[-1] == 's')):
+            # silent 'e'
+            count -= 1
+        return max(count, 1)
============================================================
--- symbolstate.py	83408e4998ed66268e39b916225c46e89b976cdb
+++ symbolstate.py	83408e4998ed66268e39b916225c46e89b976cdb
@@ -0,0 +1,60 @@
+
+import math
+
+class MarkovException(Exception):
+    pass
+
+class MarkovScore:
+    def __init__(self):
+	self.scores = {}
+	self.total = 0
+
+    def add_score(self, token):
+	self.scores.setdefault(token, 0)
+	self.scores[token] += 1
+	self.total += 1
+
+    def entropy(self):
+	if not hasattr(self, 'h'):
+	    self.h = -1 * sum(map(lambda p: p * math.log(p, 2),
+				  map(lambda x: (self.scores[x] / float(self.total)), self.scores)))
+	return self.h
+
+class MarkovModel:
+    def __init__(self, size):
+	self.total = 0
+	self.size = size
+	self.scores = {}
+
+    def add(self, tokens, token):
+	tokens = tuple(tokens)
+	if len(tokens) != self.size:
+	    raise MarkovException("Token list is of incorrect size.")
+	if not self.scores.has_key(tokens):
+	    self.scores[tokens] = MarkovScore()
+	self.scores[tokens].add_score(token)
+	self.total += 1
+
+
+class SymbolState:
+    def __init__(self, corpus, chain_size=2):
+	self.forward_markov = MarkovModel(chain_size)
+	self.corpus = corpus
+
+    def update(self):
+	buffer = []
+	for token in self.corpus:
+	    if token == None:
+		continue
+	    if len(buffer) == self.forward_markov.size:
+		self.forward_markov.add(buffer, token)
+		buffer = buffer[1:]
+	    buffer.append(token)
+
+    def chunkable(self):
+	entropies = [self.forward_markov.scores[t].entropy() for t in self.forward_markov.scores]
+	mean_h = sum(entropies) / len(entropies)
+	sd_h = math.sqrt(sum([ pow(t - mean_h, 2) for t in entropies ]) / len(entropies))
+	cutoff = mean_h + 8 * sd_h # should really justify in some way other than 'it works'
+	return filter(lambda tokens: self.forward_markov.scores[tokens].entropy() > cutoff,
+		      self.forward_markov.scores)
============================================================
--- test.py	65bdbf5e10acc674375486e7889981090bd9255b
+++ test.py	65bdbf5e10acc674375486e7889981090bd9255b
@@ -0,0 +1,3 @@
+#!/usr/bin/python
+# Placeholder script: it contains no tests, only a no-op statement.
+pass
============================================================
--- config.py	2f4bcd09716fcbb8fed88991cb35d2c009e34cfb
+++ config.py	1028ab910fddc15e627d9e16fcd47d75bd0954ec
@@ -1,7 +1,12 @@ import os

 import os

-install_path = '/Users/grahame/monotone/memes'
+install_path = '/Users/grahame/monotone/memes/'
 storage_path = os.path.join(install_path, 'storage')
 word_shelf = os.path.join(install_path, 'word.shelf')
 word_db = os.path.join(install_path, 'word.db')
+
+user_schema = os.path.join(install_path, 'sql', 'user.sql')
+
+haiku_line_attempts = 3  # tries forms.haiku makes at each line of a poem
+haiku_attempts = 3  # whole-poem tries gen_haiku makes before giving up
============================================================
--- generators.py	1f4da17464aa9497077d16c27ccb84e60453057d
+++ generators.py	550b478b5c306ae773cd42e8e5aff775cb0b8aed
@@ -4,6 +4,7 @@ import libxml2
 import sha
 import socket
 import libxml2
+import urlparse

 s_splitter = re.compile(r'[\.\!]+')
 w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')
@@ -48,9 +49,10 @@ class Callback:
 	return rv

 class Callback:
-	def __init__(self, queue, article_ids):
+	def __init__(self, queue, article_ids, new_article_ids):
 		self.queue = queue
 		self.article_ids = article_ids
+		self.new_article_ids = new_article_ids
 	def startDocument(self):
 		self.gather_chars = False
 		self.in_item = False
@@ -61,7 +63,8 @@ class Callback:
 		elif self.in_guid: self.guid += data
 	def endDocument(self): pass
 	def startElement(self, tag, attrs):
-		if tag == "item": self.in_item = True
+		if tag == "item":
+			self.in_item = True
 		if tag == "guid":
 			self.guid = ""
 			self.in_guid = True
@@ -70,7 +73,8 @@ class Callback:
 	def endElement(self, tag):
 		if tag == "item":
 			self.in_item = False
-			self.article_ids.add(self.current_guid)
+			if self.current_guid and self.current_guid not in self.article_ids:
+				self.new_article_ids.add(self.current_guid)
 		if tag == "guid":
 			self.in_guid = False
 			self.current_guid = sha.new(self.guid).hexdigest()
@@ -82,9 +86,9 @@ class Callback:
 			self.chars = ""
 			self.gather_chars = False

-def RSSGenerator(data_generator, article_ids=[]):
+def RSSGenerator(data_generator, article_ids, new_article_ids):
 	queue = []
-	callback = Callback(queue, article_ids)
+	callback = Callback(queue, article_ids, new_article_ids)
 	ctxt = libxml2.createPushParser(callback, "", 0, "")
 	for data in data_generator:
 		ctxt.parseChunk(data, len(data), 0)
@@ -95,8 +99,9 @@ def RSSGenerator(data_generator, article
 	queue.reverse()
 	while len(queue):
 		yield queue.pop()
-
-def HTTPGenerator(host, uri):
+
+def HTTPGenerator(uri):
+	host = urlparse.urlparse(uri)[1]
 	fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 	fd.connect((host, 80))
 	fd.send("GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: http://grahame.angrygoats.net/lj-haiku/; grahame@angrygoats.net\r\nX-Goat: yes\r\n\r\n" % (uri, host))