from path import path
from hyphen import Hyphenator
+Infinity = float('inf')
+
STRIP_PAT = re.compile(r'[^a-zA-Z\'\-]+')
VAR = path('var')
class FinancialCrisis(object):
+ verbose = False
+ noOverlap = False # whether to forbid overlapping haikus
+
start_line = 0
+ limit = Infinity
haikus = [] # Results (haiku, line_no)
words = None # Cache of the previous 23 pairs: (word, syllables)
seen_lines = 0
- def __init__(self, start_line=0, book=BOOK_DATAPATH):
+ def __init__(self, start_line=0, limit=Infinity, book=BOOK_DATAPATH, noOverlap=False, verbose=False):
+ self.start_line = start_line
+ self.limit = limit
+ self.seen_words = 0
+ self.seen_lines = 0
+ self.noOverlap = noOverlap
+ self.verbose = verbose
self.book = str(book)
self.hyphenator = Hyphenator('en_US')
self.words = []
self.haikus = []
-
- self.start_line = start_line
- self.seen_words = 0
- self.seen_lines = 0
def numSyllables(self, word):
self.seen_words += 1
syllables = self.numSyllables(word)
- if syllables < 1: return
+ if syllables < 1:
+ if self.verbose: print 'Dropping: %r' % (word)
+ return
+
self.words.insert(0, (word, syllables))
if len(self.words) > 23:
self.words.pop()
- stanza3 = self.findStanza( self.words, 5 )
+ offset = 0
+ stanza3 = self.findStanza( self.words[offset:], 5 )
if not stanza3: return
- stanza2 = self.findStanza( self.words[len(stanza3):], 7 )
+ offset += len(stanza3)
+ stanza2 = self.findStanza( self.words[offset:], 7 )
if not stanza2: return
- stanza1 = self.findStanza( self.words[len(stanza3)+len(stanza2):], 5 )
+ offset += len(stanza2)
+ stanza1 = self.findStanza( self.words[offset:], 5 )
if not stanza1: return
+ offset += len(stanza1)
haiku = [stanza1, stanza2, stanza3]
- self.haikus.append( (haiku, self.seen_lines) )
+ if self.verbose:
+ print 'Found haiku (line %s):' % self.seen_lines
+ print '\t', ' '.join(stanza1), '/', ' '.join(stanza2), '/', ' '.join(stanza3)
+ print 'Word buffer:'
+ print '\t', repr(self.words)
+ print
+
+ self.haikus.append(
+ (haiku, self.seen_lines, self.seen_words-offset, self.seen_words) )
+
+ if self.noOverlap:
+ self.words = []
+
return haiku
def process(self):
print 'Starting Haiku processing on line %s...' % self.seen_lines
with codecs.open(self.book, 'rU', 'utf-8') as f:
for line_no, line in enumerate(f):
- if line_no < self.start_line: continue
+ if line_no < self.start_line: continue
+ if line_no >= self.limit: break
self.seen_lines += 1
for word in line.split():
if not word: continue
haiku = self.offer(word)
- if self.seen_lines % 1000 == 0:
+ if False and self.seen_lines % 1000 == 0:
print '-' * 20
print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
- self.printHaiku(*self.haikus[-1])
+ self.printHaiku(self.haikus[-1])
print 'Done!'
- def printHaikus(self):
- print '-' * 20
- print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
- for haiku, linenum in self.haikus:
- self.printHaiku(haiku, linenum)
- print
-
- def printHaiku(self, haiku, linenum, outfile=sys.stdout):
- outfile.write('On line %s:\n' % linenum)
+ def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False):
+ haiku, linenum, start_words, end_words = info
+ if header:
+ wordloc = ''
+ if wordlocs and start_words and end_words:
+ wordloc = ' (words %s to %s)' % (start_words, end_words)
+ outfile.write('On line %s%s:\n' % (linenum, wordloc))
lines = [ ' '.join(stanza) for stanza in haiku ]
# lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ]
- maxlen = max(map(len, lines))
for line in lines:
- outfile.write(u' {line: ^{maxlen}}\n'.format(line=line, maxlen=maxlen))
+ outfile.write(u' {line: ^80}'.format(line=line).rstrip()+'\n')
outfile.write('\n')
+ def saveHaikus(self, outpath=HAIKUSPATH):
+ # Write every found haiku to outpath (UTF-8), then print the ten
+ # longest "chains" to stdout. A chain is a run of haikus in which each
+ # one starts at the word count where the previous one ended; only the
+ # first haiku of a chain is written with its header.
+ print 'Saving %s haiku to %s...' % (len(self.haikus), outpath)
+
+ # None rather than 0: a first haiku whose start_words is 0 must open a
+ # new chain (and get a header) instead of being appended to the initial
+ # empty chain below, which is never added to chains.
+ last_wc = None
+ chains = []
+ chain = []
+
+ with codecs.open(outpath, 'w', 'utf-8') as out:
+ out.write('Found %s haiku...\n\n' % len(self.haikus))
+ for info in self.haikus:
+ haiku, linenum, start_words, end_words = info
+
+ # Starts exactly where the previous haiku ended: same chain.
+ if last_wc == start_words:
+ header = False
+ chain.append(haiku)
+ else:
+ header = True
+ chain = [haiku]
+ chains.append(chain)
+
+ last_wc = end_words
+ self.printHaiku(info, header=header, outfile=out)
+
+ print '\nLongest Chains of Haikus:'
+ # Sort (length, chain) pairs in descending order so the longest chains come first.
+ chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True)
+ for (length, chain) in chains[:10]:
+ print ('- ' * 40) + '\n'
+ for haiku in chain:
+ # Placeholder line/word positions; no header is printed here.
+ self.printHaiku((haiku, 0, 0, 0), header=False)
+
+ print '- ' * 40
+ print '\nDone!'
+
+ def printHaikus(self):
+ # Print a summary banner (haiku count and lines seen so far), then
+ # every stored haiku via printHaiku() with its default arguments.
+ print '-' * 20
+ print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
+ for info in self.haikus:
+ # info is the (haiku, line, start_words, end_words) tuple appended to self.haikus.
+ self.printHaiku(info)
+ print
def save(self, statepath=STATEPATH):
saveCache()
with codecs.open(statepath, 'w', 'utf-8') as f:
f.write(cjson.encode(state))
- def saveHaikus(self, outpath=HAIKUSPATH):
- print 'Saving %s haiku to %s...' % (len(self.haikus), outpath)
- with codecs.open(outpath, 'w', 'utf-8') as out:
- out.write('Found %s haiku...\n\n' % len(self.haikus))
- for haiku, linenum in self.haikus:
- self.printHaiku(haiku, linenum, out)
- print 'Done!'