Updates to search for the longest haiku chain.

author dsc <david.schoonover@gmail.com>

Sun, 18 Dec 2011 10:50:09 +0000 (02:50 -0800)

committer dsc <david.schoonover@gmail.com>

Sun, 18 Dec 2011 10:50:09 +0000 (02:50 -0800)
author dsc <david.schoonover@gmail.com>
Sun, 18 Dec 2011 10:50:09 +0000 (02:50 -0800)
committer dsc <david.schoonover@gmail.com>
Sun, 18 Dec 2011 10:50:09 +0000 (02:50 -0800)
diff --git a/crisishaiku.py b/crisishaiku.py

index f377bfa..d9a7a13 100644 (file)
--- a/crisishaiku.py
+++ b/crisishaiku.py
@@ -2,6 +2,8 @@ import codecs, msgpack, cjson, re, sys
 from path import path
 from hyphen import Hyphenator
 
+Infinity = float('inf')
+
 STRIP_PAT = re.compile(r'[^a-zA-Z\'\-]+')
 
 VAR               = path('var')
@@ -25,7 +27,11 @@ def saveCache():
 
 
 class FinancialCrisis(object):
+    verbose    = False
+    noOverlap  = False  # whether haikus can overlap
+    
     start_line = 0
+    limit      = Infinity
     haikus     = []     # Results (haiku, line_no)
     words      = None   # Cache previous previous 23 pairs: (word, syllables)
     
@@ -34,15 +40,17 @@ class FinancialCrisis(object):
     seen_lines = 0
     
     
-    def __init__(self, start_line=0, book=BOOK_DATAPATH):
+    def __init__(self, start_line=0, limit=Infinity, book=BOOK_DATAPATH, noOverlap=False, verbose=False):
+        self.start_line = start_line
+        self.limit      = limit
+        self.seen_words = 0
+        self.seen_lines = 0
+        self.noOverlap  = noOverlap
+        self.verbose    = verbose
         self.book       = str(book)
         self.hyphenator = Hyphenator('en_US')
         self.words      = []
         self.haikus     = []
-        
-        self.start_line = start_line
-        self.seen_words = 0
-        self.seen_lines = 0
     
     
     def numSyllables(self, word):
@@ -74,56 +82,112 @@ class FinancialCrisis(object):
         self.seen_words += 1
         
         syllables = self.numSyllables(word)
-        if syllables < 1: return
+        if syllables < 1:
+            if self.verbose: print 'Dropping:   %r' % (word)
+            return
+        
         self.words.insert(0, (word, syllables))
         if len(self.words) > 23:
             self.words.pop()
         
-        stanza3 = self.findStanza( self.words, 5 )
+        offset  = 0
+        stanza3 = self.findStanza( self.words[offset:], 5 )
         if not stanza3: return
-        stanza2 = self.findStanza( self.words[len(stanza3):], 7 )
+        offset += len(stanza3)
+        stanza2 = self.findStanza( self.words[offset:], 7 )
         if not stanza2: return
-        stanza1 = self.findStanza( self.words[len(stanza3)+len(stanza2):], 5 )
+        offset += len(stanza2)
+        stanza1 = self.findStanza( self.words[offset:], 5 )
         if not stanza1: return
+        offset += len(stanza1)
         
         haiku = [stanza1, stanza2, stanza3]
-        self.haikus.append( (haiku, self.seen_lines) )
+        if self.verbose:
+            print 'Found haiku (line %s):' % self.seen_lines
+            print '\t', ' '.join(stanza1), '/', ' '.join(stanza2), '/', ' '.join(stanza3)
+            print 'Word buffer:'
+            print '\t', repr(self.words)
+            print
+        
+        self.haikus.append(
+            (haiku, self.seen_lines, self.seen_words-offset, self.seen_words) )
+        
+        if self.noOverlap:
+            self.words = []
+        
         return haiku
     
     def process(self):
         print 'Starting Haiku processing on line %s...' % self.seen_lines
         with codecs.open(self.book, 'rU', 'utf-8') as f:
             for line_no, line in enumerate(f):
-                if line_no < self.start_line: continue
+                if line_no < self.start_line:   continue
+                if line_no >= self.limit:       break
                 
                 self.seen_lines += 1
                 for word in line.split():
                     if not word: continue
                     haiku = self.offer(word)
                 
-                if self.seen_lines % 1000 == 0:
+                if False and self.seen_lines % 1000 == 0:
                     print '-' * 20
                     print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
-                    self.printHaiku(*self.haikus[-1])
+                    self.printHaiku(self.haikus[-1])
         print 'Done!'
     
     
-    def printHaikus(self):
-        print '-' * 20
-        print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
-        for haiku, linenum in self.haikus:
-            self.printHaiku(haiku, linenum)
-        print
-    
-    def printHaiku(self, haiku, linenum, outfile=sys.stdout):
-        outfile.write('On line %s:\n' % linenum)
+    def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False):
+        haiku, linenum, start_words, end_words = info
+        if header:
+            wordloc = ''
+            if wordlocs and start_words and end_words:
+                wordloc = ' (words %s to %s)' % (start_words, end_words)
+            outfile.write('On line %s%s:\n' % (linenum, wordloc))
         lines = [ ' '.join(stanza) for stanza in haiku ]
         # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ]
-        maxlen = max(map(len, lines))
         for line in lines:
-            outfile.write(u'    {line: ^{maxlen}}\n'.format(line=line, maxlen=maxlen))
+            outfile.write(u'    {line: ^80}'.format(line=line).rstrip()+'\n')
         outfile.write('\n')
     
+    def saveHaikus(self, outpath=HAIKUSPATH):
+        print 'Saving %s haiku to %s...' % (len(self.haikus), outpath)
+        
+        last_wc = 0
+        chains = []
+        chain = []
+        
+        with codecs.open(outpath, 'w', 'utf-8') as out:
+            out.write('Found %s haiku...\n\n' % len(self.haikus))
+            for info in self.haikus:
+                haiku, linenum, start_words, end_words = info
+                
+                if last_wc == start_words:
+                    header = False
+                    chain.append(haiku)
+                else:
+                    header = True
+                    chain = [haiku]
+                    chains.append(chain)
+                
+                last_wc = end_words
+                self.printHaiku(info, header=header, outfile=out)
+        
+        print '\nLongest Chains of Haikus:'
+        chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True)
+        for (length, chain) in chains[:10]:
+            print ('- ' * 40) + '\n'
+            for haiku in chain:
+                self.printHaiku((haiku, 0, 0, 0), header=False)
+        
+        print '- ' * 40
+        print '\nDone!'
+    
+    def printHaikus(self):
+        print '-' * 20
+        print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
+        for info in self.haikus:
+            self.printHaiku(info)
+        print
     
     def save(self, statepath=STATEPATH):
         saveCache()
@@ -132,13 +196,6 @@ class FinancialCrisis(object):
         with codecs.open(statepath, 'w', 'utf-8') as f:
             f.write(cjson.encode(state))
     
-    def saveHaikus(self, outpath=HAIKUSPATH):
-        print 'Saving %s haiku to %s...' % (len(self.haikus), outpath)
-        with codecs.open(outpath, 'w', 'utf-8') as out:
-            out.write('Found %s haiku...\n\n' % len(self.haikus))
-            for haiku, linenum in self.haikus:
-                self.printHaiku(haiku, linenum, out)
-        print 'Done!'
     
 
 
diff --git a/data/syllables.msgpack b/data/syllables.msgpack

index 3253fc4..53bf294 100644 (file)

Binary files a/data/syllables.msgpack and b/data/syllables.msgpack differ
author	dsc <david.schoonover@gmail.com>
	Sun, 18 Dec 2011 10:50:09 +0000 (02:50 -0800)
committer	dsc <david.schoonover@gmail.com>
	Sun, 18 Dec 2011 10:50:09 +0000 (02:50 -0800)
crisishaiku.py		patch \| blob \| history
data/syllables.msgpack		patch \| blob \| history