From: dsc Date: Sun, 18 Dec 2011 10:50:09 +0000 (-0800) Subject: Updates to search for the longest haiku chain. X-Git-Url: http://git.less.ly:3516/?a=commitdiff_plain;h=2bea1b72fceb151c58c064bb8a32772b9bc6446d;p=crisishaiku.git Updates to search for the longest haiku chain. --- diff --git a/crisishaiku.py b/crisishaiku.py index f377bfa..d9a7a13 100644 --- a/crisishaiku.py +++ b/crisishaiku.py @@ -2,6 +2,8 @@ import codecs, msgpack, cjson, re, sys from path import path from hyphen import Hyphenator +Infinity = float('inf') + STRIP_PAT = re.compile(r'[^a-zA-Z\'\-]+') VAR = path('var') @@ -25,7 +27,11 @@ def saveCache(): class FinancialCrisis(object): + verbose = False + noOverlap = False # whehter haikus can overlap + start_line = 0 + limit = Infinity haikus = [] # Results (haiku, line_no) words = None # Cache previous previous 23 pairs: (word, syllables) @@ -34,15 +40,17 @@ class FinancialCrisis(object): seen_lines = 0 - def __init__(self, start_line=0, book=BOOK_DATAPATH): + def __init__(self, start_line=0, limit=Infinity, book=BOOK_DATAPATH, noOverlap=False, verbose=False): + self.start_line = start_line + self.limit = limit + self.seen_words = 0 + self.seen_lines = 0 + self.noOverlap = noOverlap + self.verbose = verbose self.book = str(book) self.hyphenator = Hyphenator('en_US') self.words = [] self.haikus = [] - - self.start_line = start_line - self.seen_words = 0 - self.seen_lines = 0 def numSyllables(self, word): @@ -74,56 +82,112 @@ class FinancialCrisis(object): self.seen_words += 1 syllables = self.numSyllables(word) - if syllables < 1: return + if syllables < 1: + if self.verbose: print 'Dropping: %r' % (word) + return + self.words.insert(0, (word, syllables)) if len(self.words) > 23: self.words.pop() - stanza3 = self.findStanza( self.words, 5 ) + offset = 0 + stanza3 = self.findStanza( self.words[offset:], 5 ) if not stanza3: return - stanza2 = self.findStanza( self.words[len(stanza3):], 7 ) + offset += len(stanza3) + stanza2 = self.findStanza( self.words[offset:], 7 ) if not stanza2: return - stanza1 = self.findStanza( self.words[len(stanza3)+len(stanza2):], 5 ) + offset += len(stanza2) + stanza1 = self.findStanza( self.words[offset:], 5 ) if not stanza1: return + offset += len(stanza1) haiku = [stanza1, stanza2, stanza3] - self.haikus.append( (haiku, self.seen_lines) ) + if self.verbose: + print 'Found haiku (line %s):' % self.seen_lines + print '\t', ' '.join(stanza1), '/', ' '.join(stanza2), '/', ' '.join(stanza3) + print 'Word buffer:' + print '\t', repr(self.words) + print + + self.haikus.append( + (haiku, self.seen_lines, self.seen_words-offset, self.seen_words) ) + + if self.noOverlap: + self.words = [] + return haiku def process(self): print 'Starting Haiku processing on line %s...' % self.seen_lines with codecs.open(self.book, 'rU', 'utf-8') as f: for line_no, line in enumerate(f): - if line_no < self.start_line: continue + if line_no < self.start_line: continue + if line_no >= self.limit: break self.seen_lines += 1 for word in line.split(): if not word: continue haiku = self.offer(word) - if self.seen_lines % 1000 == 0: + if False and self.seen_lines % 1000 == 0: print '-' * 20 print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines) - self.printHaiku(*self.haikus[-1]) + self.printHaiku(self.haikus[-1]) print 'Done!' - def printHaikus(self): - print '-' * 20 - print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines) - for haiku, linenum in self.haikus: - self.printHaiku(haiku, linenum) - print - - def printHaiku(self, haiku, linenum, outfile=sys.stdout): - outfile.write('On line %s:\n' % linenum) + def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False): + haiku, linenum, start_words, end_words = info + if header: + wordloc = '' + if wordlocs and start_words and end_words: + wordloc = ' (words %s to %s)' % (start_words, end_words) + outfile.write('On line %s%s:\n' % (linenum, wordloc)) lines = [ ' '.join(stanza) for stanza in haiku ] # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ] - maxlen = max(map(len, lines)) for line in lines: - outfile.write(u' {line: ^{maxlen}}\n'.format(line=line, maxlen=maxlen)) + outfile.write(u' {line: ^80}'.format(line=line).rstrip()+'\n') outfile.write('\n') + def saveHaikus(self, outpath=HAIKUSPATH): + print 'Saving %s haiku to %s...' % (len(self.haikus), outpath) + + last_wc = 0 + chains = [] + chain = [] + + with codecs.open(outpath, 'w', 'utf-8') as out: + out.write('Found %s haiku...\n\n' % len(self.haikus)) + for info in self.haikus: + haiku, linenum, start_words, end_words = info + + if last_wc == start_words: + header = False + chain.append(haiku) + else: + header = True + chain = [haiku] + chains.append(chain) + + last_wc = end_words + self.printHaiku(info, header=header, outfile=out) + + print '\nLongest Chains of Haikus:' + chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True) + for (length, chain) in chains[:10]: + print ('- ' * 40) + '\n' + for haiku in chain: + self.printHaiku((haiku, 0, 0, 0), header=False) + + print '- ' * 40 + print '\nDone!' + + def printHaikus(self): + print '-' * 20 + print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines) + for info in self.haikus: + self.printHaiku(info) + print def save(self, statepath=STATEPATH): saveCache() @@ -132,13 +196,6 @@ class FinancialCrisis(object): with codecs.open(statepath, 'w', 'utf-8') as f: f.write(cjson.encode(state)) - def saveHaikus(self, outpath=HAIKUSPATH): - print 'Saving %s haiku to %s...' % (len(self.haikus), outpath) - with codecs.open(outpath, 'w', 'utf-8') as out: - out.write('Found %s haiku...\n\n' % len(self.haikus)) - for haiku, linenum in self.haikus: - self.printHaiku(haiku, linenum, out) - print 'Done!' diff --git a/data/syllables.msgpack b/data/syllables.msgpack index 3253fc4..53bf294 100644 Binary files a/data/syllables.msgpack and b/data/syllables.msgpack differ