The unified diff between revisions [092dff1e..] and [0eaf66b5..] is displayed below. It can also be downloaded as a raw diff.
#
#
# add_file "forms.py"
# content [b631b2a653d8c57334c492d8e87cd9296ac07fa4]
#
# add_file "poet3.py"
# content [3c5adc98809bbd5a6df624911190c02c311f2057]
#
# add_file "storage.py"
# content [bff5648f025b275e05135246f0c56236953eaef4]
#
# add_file "syllables.py"
# content [1ddbe0eb3425f3b5cc513e8af525d583ac771aff]
#
# add_file "symbolstate.py"
# content [83408e4998ed66268e39b916225c46e89b976cdb]
#
# add_file "test.py"
# content [65bdbf5e10acc674375486e7889981090bd9255b]
#
# patch "config.py"
# from [2f4bcd09716fcbb8fed88991cb35d2c009e34cfb]
# to [1028ab910fddc15e627d9e16fcd47d75bd0954ec]
#
# patch "generators.py"
# from [1f4da17464aa9497077d16c27ccb84e60453057d]
# to [550b478b5c306ae773cd42e8e5aff775cb0b8aed]
#
# set "poet3.py"
# attr "mtn:execute"
# value "true"
#
============================================================
--- forms.py b631b2a653d8c57334c492d8e87cd9296ac07fa4
+++ forms.py b631b2a653d8c57334c492d8e87cd9296ac07fa4
@@ -0,0 +1,103 @@
+
+import random
+import config
+
+def haiku(doc, form=(5, 7, 5)):
+    """Generate a poem whose lines hit the syllable counts in `form`.
+
+    Tokens are chosen at random from doc.symbol_state.forward_markov,
+    weighted by their recorded counts; doc.syllables.lookup supplies the
+    syllable count of each token.  Every line after the first is seeded
+    with the tail of the line before it.
+
+    Returns a list of token lists, one per entry in `form`, or None if a
+    line could not be built in config.haiku_line_attempts tries.
+    """
+    markov = doc.symbol_state.forward_markov
+
+    def pickfrom(possible, total, get_total):
+        # Weighted random choice.  `total` must equal the sum of
+        # get_total(x) over `possible`, or the loop can run off the end.
+        k = random.randint(0, total - 1)
+        for seq in possible:
+            k -= get_total(seq)
+            if k < 0:
+                return seq
+        raise Exception("Failed to pick a number - 'total' miscalcuation? (%d, %d)" % (k, total))
+
+    def generate_line(target, state):
+        # Returns a list of tokens totalling exactly `target` syllables,
+        # or None if this attempt hit a dead end.
+        state = tuple(state)
+        line = []
+        count = 0
+        syl = doc.syllables.lookup
+
+        if len(state) > markov.size:
+            state = state[-1*markov.size:]
+
+        # The previous line may end on a state the model has no entry for
+        # (is_not_deadend does not check the token that completes a line),
+        # so an unknown state is handled like a missing one.
+        if len(state) < markov.size or state not in markov.scores:
+            if not markov.scores:
+                # empty model: there is nothing to pick from
+                return None
+
+            # we'll have to pick a starting point
+            #
+            # restricting the choice to sequences that fit the line would
+            # be the most accurate, but is way too slow
+            seq = pickfrom(markov.scores, markov.total, lambda seq: markov.scores[seq].total)
+            count += sum(map(syl, seq))
+            line += list(seq)
+            state = seq
+
+        while count < target:
+            maxsize = target - count
+            score = markov.scores[state]
+
+            # okay, if count + syl(next_token) != target then we need there
+            # to be an entry in the symbolstate for that next potential
+            # symbol. this lets us restrict further and not fall down holes
+            # so often
+            def is_not_deadend(tok):
+                next_count = count + syl(tok)
+                if next_count == target:
+                    return True
+                next_state = (state + (tok,))[1:]
+                return markov.scores.get(next_state) is not None
+
+            possible = set(filter(lambda tok: syl(tok) <= maxsize and is_not_deadend(tok), score.scores))
+            print "status:", target, state, score.scores.keys(), possible
+            total_possible = sum(map(lambda tok: score.scores[tok], possible))
+            if not possible:
+                break
+
+            token = pickfrom(possible, total_possible, lambda seq: score.scores[seq])
+            count += syl(token)
+            state = (state + (token,))[1:]
+            line.append(token)
+
+        if count != target:
+            return None
+        return line
+
+    rv = []
+    last_line = []
+    for length in form:
+        print "** target length is:", length
+
+        # stays None if config.haiku_line_attempts is zero
+        line = None
+        for i in xrange(config.haiku_line_attempts):
+            line = generate_line(length, last_line)
+            if line is not None:
+                break
+
+        print "** resulting line is:", line
+        if not line:
+            return None
+        rv.append(line)
+        last_line = line
+    return rv
============================================================
--- poet3.py 3c5adc98809bbd5a6df624911190c02c311f2057
+++ poet3.py 3c5adc98809bbd5a6df624911190c02c311f2057
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+#
+# take poet.py and make it fast / sensible,
+# ready for deployment on angrygoats.net
+#
+
+from forms import haiku
+from generators import RSSGenerator, HTTPGenerator
+from storage import Storage
+from syllables import Syllables
+from symbolstate import SymbolState
+import os
+import config
+
+class PoetException(Exception):
+    """Raised by gen_haiku() when no poem could be generated."""
+    pass
+
+class Poet3:
+    """Keeps a token corpus for one feed URI and a Markov model of it.
+
+    forms.haiku() reads the `symbol_state` and `syllables` attributes.
+    """
+
+    def __init__(self, uri):
+        self.uri = uri
+        self.storage = Storage(self.uri)
+        self.syllables = Syllables()
+        # upwrite() leaves this marker file behind once it has run
+        self.upwritten = self.storage.has_file('has_been_upwritten.txt')
+
+    def update(self):
+        """Fetch the feed; append its tokens to corpus.txt and the ids
+        collected in new_articles to articles.txt."""
+        self.storage.require_files(('articles.txt', 'corpus.txt'))
+
+        articles, new_articles = set(), set()
+        articles_fd = self.storage.open('articles.txt')
+        try:
+            for line in articles_fd:
+                articles.add(line.strip())
+        finally:
+            articles_fd.close()
+
+        corpus_fd = self.storage.open('corpus.txt', 'a')
+        try:
+            for token_seq in RSSGenerator(HTTPGenerator(self.uri), articles, new_articles):
+                for token in token_seq:
+                    corpus_fd.write(token+"\n")
+        finally:
+            corpus_fd.close()
+
+        articles_fd = self.storage.open('articles.txt', 'a')
+        try:
+            for article in new_articles:
+                articles_fd.write(article+"\n")
+        finally:
+            articles_fd.close()
+
+    def build_state(self):
+        """Build self.symbol_state from the tokens in corpus.txt."""
+        corpus_fd = self.storage.open('corpus.txt')
+        try:
+            self.symbol_state = SymbolState((t.strip() for t in corpus_fd))
+            self.symbol_state.update()
+        finally:
+            corpus_fd.close()
+
+    def upwrite(self, to_upwrite):
+        """Rewrite corpus.txt so that every occurrence of a sequence in
+        `to_upwrite` becomes one token, its parts joined by spaces.
+
+        build_state() must have been called first.  Does nothing when
+        `to_upwrite` is empty.
+        """
+        if len(to_upwrite) == 0:
+            return
+
+        size = self.symbol_state.forward_markov.size
+
+        def get_token_lists(corpus_fd):
+            # matches[i] counts how many leading tokens of to_upwrite[i]
+            # the end of the input read so far matches; `buffer` holds
+            # tokens held back because they may belong to a match.
+            matches = [0] * len(to_upwrite)
+            buffer = []
+            for token in (t.strip() for t in corpus_fd):
+                buffer.append(token)
+                for i, seq in enumerate(to_upwrite):
+                    if token == seq[matches[i]]:
+                        matches[i] += 1
+                    elif token == seq[0]:
+                        # mismatch, but this token can begin a new match
+                        matches[i] = 1
+                    else:
+                        matches[i] = 0
+
+                    if matches[i] == size:
+                        yield [' '.join(to_upwrite[i])]
+                        matches = [0] * len(to_upwrite)
+                        buffer = []
+                        break
+
+                longest_match = max(matches)
+                if len(buffer) > longest_match:
+                    output_idx = len(buffer)-longest_match
+                    output, buffer = buffer[:output_idx], buffer[output_idx:]
+                    yield output
+
+            # tokens still held back at end of input were only a partial
+            # match; write them out rather than dropping them
+            if buffer:
+                yield buffer
+
+        corpus_fd = self.storage.open('corpus.txt')
+        try:
+            upwrite_fd = self.storage.open('upwrite.txt', 'w')
+            try:
+                for token_list in get_token_lists(corpus_fd):
+                    for token in token_list:
+                        upwrite_fd.write(token+'\n')
+            finally:
+                upwrite_fd.close()
+        finally:
+            corpus_fd.close()
+
+        os.rename(self.storage.file('upwrite.txt'),
+                  self.storage.file('corpus.txt'))
+
+        # leave the marker so that later runs skip the rewrite
+        self.storage.open('has_been_upwritten.txt', 'w').close()
+        self.upwritten = True
+
+def gen_haiku(uri):
+    """Update the corpus for `uri` and generate a haiku from it.
+
+    Returns (attempt, poem): the zero-based index of the attempt that
+    succeeded and the poem as returned by forms.haiku().  Raises
+    PoetException if config.haiku_attempts attempts all fail.
+    """
+    poet = Poet3(uri)
+    poet.update()
+    poet.build_state()
+    if not poet.upwritten:
+        to_upwrite = poet.symbol_state.chunkable()
+        poet.upwrite(to_upwrite)
+
+    # defaults cover config.haiku_attempts == 0
+    attempt, poem = 0, None
+    for attempt in xrange(config.haiku_attempts):
+        poem = haiku(poet)
+        if poem is not None:
+            break
+
+    if poem is None:
+        raise PoetException("Could not generate you a poem!")
+
+    return attempt, poem
+
+if __name__ == '__main__':
+    uri = 'http://glamdring.local/~grahame/rss'
+    attempts, poem = gen_haiku(uri)
+    print "%d attempts" % attempts
+    for line in poem:
+        print ' '.join(line)
============================================================
--- storage.py bff5648f025b275e05135246f0c56236953eaef4
+++ storage.py bff5648f025b275e05135246f0c56236953eaef4
@@ -0,0 +1,82 @@
+
+import urlparse
+import string
+import sha
+import os
+
+import config
+
+class StorageException(Exception):
+    """Raised when a file name is not acceptable to Storage.file()."""
+    pass
+
+class Storage:
+    """A per-URI directory below config.storage_path.
+
+    The path is made of the URI's network location (letters, digits and
+    dots only) followed by five 8-character pieces of the URI's SHA-1 hex
+    digest.  It is created when the object is constructed.
+    """
+
+    def __init__(self, uri):
+        self.uri = uri
+        self.dir = self.__storage_dir()
+
+    def __storage_dir(self):
+        # list of directories in the storage path to get to this URI
+        dirs = []
+        allowed = string.letters + string.digits + '.'
+        netloc = urlparse.urlparse(self.uri)[1]
+        site = ''.join([c for c in netloc if c in allowed])
+        # no .. entries to climb the filesystem :-)
+        site = site.lstrip('.')
+        dirs.append(site)
+
+        # but this should be safe
+        digest = sha.new(self.uri).hexdigest()
+        dirs += [digest[:8], digest[8:16], digest[16:24], digest[24:32], digest[32:40]]
+
+        c_dir = os.path.join(config.storage_path, *dirs)
+        if not os.path.isdir(c_dir):
+            try:
+                # makedirs also creates config.storage_path if needed
+                os.makedirs(c_dir)
+            except OSError:
+                # fine if something else created it in the meantime
+                if not os.path.isdir(c_dir):
+                    raise
+        return c_dir
+
+    def file(self, fname):
+        """Return the path of `fname` inside the storage directory.
+
+        Raises StorageException if `fname` starts with a slash or does
+        not resolve to a path below the directory (e.g. '../x').
+        """
+        if fname.startswith('/'):
+            raise StorageException("fname may not start with a slash.")
+        base = os.path.normpath(self.dir)
+        path = os.path.normpath(os.path.join(base, fname))
+        if not path.startswith(base + os.sep):
+            raise StorageException("fname may not leave the storage directory.")
+        return os.path.join(self.dir, fname)
+
+    def has_file(self, fname):
+        """True if `fname` exists in the storage directory and is readable."""
+        return os.access(self.file(fname), os.R_OK)
+
+    def require_files(self, files):
+        """Create an empty file for each name in `files` that is not readable."""
+        for fname in files:
+            path = self.file(fname)
+            if not os.access(path, os.R_OK):
+                # append mode creates the file without ever truncating
+                # an existing one; close the handle straight away
+                open(path, 'a').close()
+
+    def open(self, fname, *args):
+        """Open `fname` in the storage directory.
+
+        Any further positional arguments go to the builtin open().
+        """
+        return open(self.file(fname), *args)
============================================================
--- syllables.py 1ddbe0eb3425f3b5cc513e8af525d583ac771aff
+++ syllables.py 1ddbe0eb3425f3b5cc513e8af525d583ac771aff
@@ -0,0 +1,38 @@
+
+class Syllables:
+    """Estimates the syllable count of tokens, caching every answer."""
+
+    def __init__(self):
+        self.cache = {}     # token -> estimated syllable count
+
+    def lookup(self, token):
+        """Return the estimated number of syllables in `token`."""
+        if token not in self.cache:
+            self.cache[token] = self.__syllable_estimate(token)
+        return self.cache[token]
+
+    def __syllable_estimate(self, token):
+        """Last resort syllable counter. Reasonably accurate in English.
+        Allegedly works for French."""
+        if len(token) == 0:
+            return 0
+        if len(token) <= 3:
+            return 1
+        # the vowel table is lower case only, so fold the token first
+        token = token.lower()
+        vowels = ('a', 'e', 'i', 'o', 'u', 'y', "'")
+        count = 0
+        after_vowel = False
+        for c in token:
+            is_vowel = c in vowels
+            # a run of consecutive vowels counts as one syllable
+            if is_vowel and not after_vowel:
+                count = count + 1
+            after_vowel = is_vowel
+        if count > 1 and ((token[-1] == 'e' and token[-2] != 'l') or
+                          (token[-2] == 'e' and token[-1] == 's')):
+            # silent 'e'
+            count = count - 1
+        if count == 0:
+            count = 1
+        return count
============================================================
--- symbolstate.py 83408e4998ed66268e39b916225c46e89b976cdb
+++ symbolstate.py 83408e4998ed66268e39b916225c46e89b976cdb
@@ -0,0 +1,83 @@
+
+import math
+
+class MarkovException(Exception):
+    """Raised when a context of the wrong length is given to a model."""
+    pass
+
+class MarkovScore:
+    """Counts of the tokens observed after one context."""
+
+    def __init__(self):
+        self.scores = {}    # token -> number of times seen
+        self.total = 0      # sum of the counts in self.scores
+        self.h = None       # cached entropy; None when out of date
+
+    def add_score(self, token):
+        self.scores[token] = self.scores.get(token, 0) + 1
+        self.total += 1
+        # the distribution changed, so a cached entropy is now stale
+        self.h = None
+
+    def entropy(self):
+        """Entropy, in bits, of the observed token distribution."""
+        if self.h is None:
+            total = float(self.total)
+            probs = [n / total for n in self.scores.values()]
+            self.h = -1 * sum([p * math.log(p, 2) for p in probs])
+        return self.h
+
+class MarkovModel:
+    """Maps each context of `size` tokens to a MarkovScore."""
+
+    def __init__(self, size):
+        self.total = 0      # number of observations added
+        self.size = size
+        self.scores = {}    # tuple of `size` tokens -> MarkovScore
+
+    def add(self, tokens, token):
+        """Record that `token` followed the context `tokens`.
+
+        Raises MarkovException unless `tokens` has exactly `size` items.
+        """
+        tokens = tuple(tokens)
+        if len(tokens) != self.size:
+            raise MarkovException("Token list is of incorrect size.")
+        if tokens not in self.scores:
+            self.scores[tokens] = MarkovScore()
+        self.scores[tokens].add_score(token)
+        self.total += 1
+
+class SymbolState:
+    """Feeds a token stream into a forward MarkovModel."""
+
+    def __init__(self, corpus, chain_size=2):
+        self.forward_markov = MarkovModel(chain_size)
+        self.corpus = corpus
+
+    def update(self):
+        """Add every token of self.corpus to the model, skipping None."""
+        buffer = []
+        for token in self.corpus:
+            if token is None:
+                continue
+            if len(buffer) == self.forward_markov.size:
+                self.forward_markov.add(buffer, token)
+                buffer = buffer[1:]
+            buffer.append(token)
+
+    def chunkable(self):
+        """Return the contexts whose entropy exceeds the cutoff.
+
+        The cutoff is the mean entropy plus eight standard deviations.
+        Returns a list of token tuples, empty if the model has no contexts.
+        """
+        scores = self.forward_markov.scores
+        if not scores:
+            # nothing observed yet; the mean below would divide by zero
+            return []
+        entropies = [scores[t].entropy() for t in scores]
+        mean_h = sum(entropies) / len(entropies)
+        sd_h = math.sqrt(sum([ pow(t - mean_h, 2) for t in entropies ]) / len(entropies))
+        cutoff = mean_h + 8 * sd_h # should really justify in some way other than 'it works'
+        return [tokens for tokens in scores if scores[tokens].entropy() > cutoff]
============================================================
--- test.py 65bdbf5e10acc674375486e7889981090bd9255b
+++ test.py 65bdbf5e10acc674375486e7889981090bd9255b
@@ -0,0 +1,3 @@
+#!/usr/bin/python
+# Placeholder module: it contains no tests yet.
+pass
============================================================
--- config.py 2f4bcd09716fcbb8fed88991cb35d2c009e34cfb
+++ config.py 1028ab910fddc15e627d9e16fcd47d75bd0954ec
@@ -1,7 +1,12 @@ import os
import os
-install_path = '/Users/grahame/monotone/memes'
+install_path = '/Users/grahame/monotone/memes/'
storage_path = os.path.join(install_path, 'storage')
word_shelf = os.path.join(install_path, 'word.shelf')
word_db = os.path.join(install_path, 'word.db')
+
+user_schema = os.path.join(install_path, 'sql', 'user.sql')
+
+haiku_line_attempts = 3
+haiku_attempts = 3
============================================================
--- generators.py 1f4da17464aa9497077d16c27ccb84e60453057d
+++ generators.py 550b478b5c306ae773cd42e8e5aff775cb0b8aed
@@ -4,6 +4,7 @@ import libxml2
import sha
import socket
import libxml2
+import urlparse
s_splitter = re.compile(r'[\.\!]+')
w_splitter = re.compile(r'[\,\;\"\t\-\(\)\* ]+')
@@ -48,9 +49,10 @@ class Callback:
return rv
class Callback:
- def __init__(self, queue, article_ids):
+ def __init__(self, queue, article_ids, new_article_ids):
self.queue = queue
self.article_ids = article_ids
+ self.new_article_ids = new_article_ids
def startDocument(self):
self.gather_chars = False
self.in_item = False
@@ -61,7 +63,8 @@ class Callback:
elif self.in_guid: self.guid += data
def endDocument(self): pass
def startElement(self, tag, attrs):
- if tag == "item": self.in_item = True
+ if tag == "item":
+ self.in_item = True
if tag == "guid":
self.guid = ""
self.in_guid = True
@@ -70,7 +73,8 @@ class Callback:
def endElement(self, tag):
if tag == "item":
self.in_item = False
- self.article_ids.add(self.current_guid)
+ if self.current_guid and self.current_guid not in self.article_ids:
+ self.new_article_ids.add(self.current_guid)
if tag == "guid":
self.in_guid = False
self.current_guid = sha.new(self.guid).hexdigest()
@@ -82,9 +86,9 @@ class Callback:
self.chars = ""
self.gather_chars = False
-def RSSGenerator(data_generator, article_ids=[]):
+def RSSGenerator(data_generator, article_ids, new_article_ids):
queue = []
- callback = Callback(queue, article_ids)
+ callback = Callback(queue, article_ids, new_article_ids)
ctxt = libxml2.createPushParser(callback, "", 0, "")
for data in data_generator:
ctxt.parseChunk(data, len(data), 0)
@@ -95,8 +99,9 @@ def RSSGenerator(data_generator, article
queue.reverse()
while len(queue):
yield queue.pop()
-
-def HTTPGenerator(host, uri):
+
+def HTTPGenerator(uri):
+ host = urlparse.urlparse(uri)[1]
fd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
fd.connect((host, 80))
fd.send("GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: http://grahame.angrygoats.net/lj-haiku/; grahame@angrygoats.net\r\nX-Goat: yes\r\n\r\n" % (uri, host))