-import codecs, msgpack, cjson, re, sys
-from path import path
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+""" Processes the Financial Crisis Inquiry Report to find bittersweet haikus.
+"""
+
+__author__ = 'David Schoonover <dsc@less.ly>'
+__copyright__ = 'Copyright (c) 2011 David Schoonover. All rights reserved.'
+__homepage__ = 'http://crisishaiku.com/'
+from crisishaiku import __version__, VERSION
+
+import sys, re, time, argparse, codecs
+
from hyphen import Hyphenator
+from path import path
+import anyjson as json
+# json.force_implementation('jsonlib2')
+
+import crisishaiku
+
+
+# Constants
Infinity = float('inf')
STRIP_PAT = re.compile(r'[^a-zA-Z\'\-]+')
-VAR = path('var')
-STATEPATH = VAR/'state.json'
-HAIKUSPATH = VAR/'haikus.txt'
+VAR_DIR = path('var')
+STATE_FILE = 'state.json'
+SYLLABLE_FILE = 'syllables.json'
+REPORT_FILE = path('data/fcir.txt')
-if not VAR.exists(): VAR.makedirs()
+OUT_DIR = VAR_DIR
+OUTFILE_OVERLAP = 'haikus.txt'
+OUTFILE_NO_OVERLAP = 'haikus-no_overlap.txt'
+OUTFILE_CHAINS = 'haikus-longest_chains.txt'
-DATA = path('data')
-BOOK_DATAPATH = DATA/'fcic.txt'
-SYLLABLE_DATAPATH = DATA/'syllables.msgpack'
-SYLLABLE_CACHE = {}
-if SYLLABLE_DATAPATH.exists():
- with SYLLABLE_DATAPATH.open('rb') as f:
- SYLLABLE_CACHE = msgpack.load(f) or {}
-def saveCache():
- with SYLLABLE_DATAPATH.open('wb') as f:
- msgpack.dump(SYLLABLE_CACHE, f)
+# Haiku Finder script
-class FinancialCrisis(object):
- verbose = False
- noOverlap = False # whehter haikus can overlap
+class HaikuFinder(object):
+    """ Finds bittersweet haikus in the Financial Crisis Inquiry Report. """
- start_line = 0
- limit = Infinity
- haikus = [] # Results (haiku, line_no)
- words = None # Cache previous previous 23 pairs: (word, syllables)
+ # Class vars
+ hyphenator = Hyphenator('en_US')
+ cache = {} # Syllable cache
- # Counters
+
+ # Setup
+ start_line = 0
+ limit = Infinity
+ overlap = True
+ verbose = False
+
+ out_dir = OUT_DIR
+ var_dir = VAR_DIR
+ cachefile = VAR_DIR/SYLLABLE_FILE
+ statefile = VAR_DIR/STATE_FILE
+ reportfile = REPORT_FILE
+
+
+ # State
seen_words = 0
seen_lines = 0
+ haikus = [] # Results (haiku, line_no)
+ words = [] # Cache previous previous 23 pairs: (word, syllables)
- def __init__(self, start_line=0, limit=Infinity, book=BOOK_DATAPATH, noOverlap=False, verbose=False):
- self.start_line = start_line
- self.limit = limit
+
+    def __init__(self, out_dir=OUT_DIR, outfile=None, start_line=0, limit=Infinity, overlap=True, verbose=False, var_dir=VAR_DIR, reportfile=REPORT_FILE):
+ self.start_line = start_line
+ self.limit = limit
+ self.overlap = overlap
+ self.verbose = verbose
+
+ self.var_dir = path(var_dir)
+ if not self.var_dir.exists(): self.var_dir.makedirs()
+ self.out_dir = path(out_dir)
+ if not self.out_dir.exists(): self.out_dir.makedirs()
+
+ if outfile is None:
+ outfile = OUTFILE_OVERLAP if self.overlap else OUTFILE_NO_OVERLAP
+ self.outfile = self.out_dir/outfile
+ else:
+ self.outfile = path(outfile)
+
+ self.reportfile = reportfile
+ self.cachefile = self.var_dir/SYLLABLE_FILE
+ self.statefile = self.var_dir/STATE_FILE
+
self.seen_words = 0
self.seen_lines = 0
- self.noOverlap = noOverlap
- self.verbose = verbose
- self.book = str(book)
- self.hyphenator = Hyphenator('en_US')
- self.words = []
self.haikus = []
+ self.words = []
+
+ self.loadCache()
+
+
+
+ def loadCache(self):
+ "Load the syllable cache from disk."
+ if self.cachefile.exists():
+            with self.cachefile.open('rb') as f:
+                cache = json.loads(f.read()) or {}
+ if cache and not self.__class__.cache:
+ self.__class__.cache = cache
+ else:
+ self.cache = cache
+ return self
+
+ def saveCache(self):
+ "Save the syllable cache to disk."
+ with self.cachefile.open('wb') as f:
+            f.write(json.dumps(self.cache))
+ return self
+
+ def save(self, statefile=None):
+ "Save the search state."
+ self.saveCache()
+ if statefile is None: statefile = self.statefile
+ FIELDS = 'words haikus seen_lines seen_words'.split()
+ state = { k:v for k, v in self.__dict__.iteritems() if k in FIELDS }
+ with codecs.open(statefile, 'w', 'utf-8') as f:
+            f.write(json.dumps(state))
+ return self
def numSyllables(self, word):
+ "Calculate number of syllables in `word`."
word = unicode( STRIP_PAT.subn(u'', word)[0] ).strip()
# print '[WORD] %s' % word
if not word or len(word) >= 100:
return 0
- if word not in SYLLABLE_CACHE: # XXX: zeros?
+ if word not in self.cache: # XXX: zeros?
try:
- SYLLABLE_CACHE[word] = max(len(self.hyphenator.syllables(word)), 1)
+ self.cache[word] = max(len(self.hyphenator.syllables(word)), 1)
except:
print word
raise
- return SYLLABLE_CACHE[word]
+ return self.cache[word]
def findStanza(self, pairs, goal=7):
+ "Attempt to find a stanza of `goal` syllables in the given list of `(word, syllables)` pairs."
stanza = []
size = 0
for word, syllables in pairs:
return []
def offer(self, word):
+ "Process the next word."
self.seen_words += 1
syllables = self.numSyllables(word)
self.haikus.append(
(haiku, self.seen_lines, self.seen_words-offset, self.seen_words) )
- if self.noOverlap:
+ if not self.overlap:
self.words = []
return haiku
- def process(self):
- print 'Starting Haiku processing on line %s...' % self.seen_lines
- with codecs.open(self.book, 'rU', 'utf-8') as f:
+
+ def run(self, reportfile=None, start_line=None, limit=None):
+ "Process the report."
+ if reportfile is None: reportfile = self.reportfile
+ if start_line is None: start_line = self.start_line
+ if limit is None: limit = self.limit
+
+ print 'Processing %s, starting on line %s...' % (reportfile, start_line)
+
+ start = time.time()
+ with codecs.open(reportfile, 'rU', 'utf-8') as f:
for line_no, line in enumerate(f):
- if line_no < self.start_line: continue
- if line_no >= self.limit: break
+ if line_no < start_line: continue
+ if line_no >= limit: break
self.seen_lines += 1
for word in line.split():
if False and self.seen_lines % 1000 == 0:
print '-' * 20
- print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
+ print '\nFound %s haiku in %s lines (%s words) so far, taking %ss...' % (len(self.haikus), self.seen_lines, self.seen_words, time.time()-start)
self.printHaiku(self.haikus[-1])
- print 'Done!'
+
+ print 'Done! Found %s haiku in %s lines (%s words), taking %ss' % (len(self.haikus), self.seen_lines, self.seen_words, time.time()-start)
+ return self
- def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False):
- haiku, linenum, start_words, end_words = info
- if header:
- wordloc = ''
- if wordlocs and start_words and end_words:
- wordloc = ' (words %s to %s)' % (start_words, end_words)
- outfile.write('On line %s%s:\n' % (linenum, wordloc))
- lines = [ ' '.join(stanza) for stanza in haiku ]
- # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ]
- for line in lines:
- outfile.write(u' {line: ^80}'.format(line=line).rstrip()+'\n')
- outfile.write('\n')
-
- def saveHaikus(self, outpath=HAIKUSPATH):
- print 'Saving %s haiku to %s...' % (len(self.haikus), outpath)
+ def saveHaikus(self, outfile=None, dump_chains=False):
+ "Write all haikus to `outfile`."
+ if outfile is None: outfile = self.outfile
+ print 'Saving %s haiku to %s...' % (len(self.haikus), outfile)
last_wc = 0
chains = []
chain = []
- with codecs.open(outpath, 'w', 'utf-8') as out:
+ with codecs.open(outfile, 'w', 'utf-8') as out:
out.write('Found %s haiku...\n\n' % len(self.haikus))
for info in self.haikus:
haiku, linenum, start_words, end_words = info
last_wc = end_words
self.printHaiku(info, header=header, outfile=out)
- print '\nLongest Chains of Haikus:'
- chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True)
- for (length, chain) in chains[:10]:
- print ('- ' * 40) + '\n'
- for haiku in chain:
- self.printHaiku((haiku, 0, 0, 0), header=False)
+ if dump_chains:
+ print '\nLongest Chains of Haikus:'
+ chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True)
+ for (length, chain) in chains[:10]:
+ print ('- ' * 40) + '\n'
+ for haiku in chain:
+ self.printHaiku((haiku, 0, 0, 0), header=False)
print '- ' * 40
print '\nDone!'
+ return self
+
+ def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False):
+ "Print haiku and metadata to `outfile`."
+ haiku, linenum, start_words, end_words = info
+ if header:
+ wordloc = ''
+ if wordlocs and start_words and end_words:
+ wordloc = ' (words %s to %s)' % (start_words, end_words)
+ outfile.write('On line %s%s:\n' % (linenum, wordloc))
+ lines = [ ' '.join(stanza) for stanza in haiku ]
+ # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ]
+ for line in lines:
+ outfile.write(u' {line: ^80}'.format(line=line).rstrip()+'\n')
+ outfile.write('\n')
+ return self
def printHaikus(self):
+ "Print all haikus to stdout."
print '-' * 20
print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
for info in self.haikus:
self.printHaiku(info)
print
+ return self
- def save(self, statepath=STATEPATH):
- saveCache()
- FIELDS = 'words haikus seen_lines seen_words'.split()
- state = { k:v for k, v in self.__dict__.iteritems() if k in FIELDS }
- with codecs.open(statepath, 'w', 'utf-8') as f:
- f.write(cjson.encode(state))
+ # Script Arguments
+
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('--version', action='version', version=__version__)
+ parser.add_argument("-v", "--verbose", action="store_true", default=verbose,
+ help="Verbose logging.")
+
+ parser.add_argument("-s", "--start-line", type=int, default=start_line,
+ help="Line in FCIR to start haiku processing. [default: %(default)s]")
+ parser.add_argument("-l", "--limit", type=int, default=limit,
+ help="Stop processing after finding this many haiku.")
+ parser.add_argument("-o", "--overlap", action="store_true", dest="overlap", default=overlap,
+ help="Allow haiku text to overlap in FCIR. [Default: %(default)s]")
+ parser.add_argument("-O", "--no-overlap", action="store_false", dest="overlap",
+ help="Do not allow haiku text to overlap in FCIR. [Default: %(default)s]")
+
+ parser.add_argument("-d", "--var-dir", type=path, default=var_dir,
+ help="Working directory for state files. [default: %(default)s]")
+ parser.add_argument("--report-file", type=path, dest="reportfile", default=reportfile,
+ help="Path to Financial Crisis Inquiry Report plaintext file. [default: %(default)s]")
+
+    parser.add_argument("out_dir", nargs='?', type=path, default=out_dir,
+ help="Directory to write result files. [default: %(default)s]")
+
+
+ @classmethod
+ def parse(cls, *args, **overrides):
+ parsed = cls.parser.parse_args(args or None)
+ values = dict(**parsed.__dict__)
+ values.update(overrides)
+ return values
+
+ @classmethod
+ def main(cls, *args, **overrides):
+        values = cls.parse(*args, **overrides)
+        app = cls(**values)
+        app.run()
+        return 0
+
+if __name__ == '__main__':
+ sys.exit(HaikuFinder.main())
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys, codecs, locale
+import argparse
+from path import path
+
+
+__all__ = ('FileType', 'PathType', 'DirectoryType', 'PathTypeError',)
+
+
+class PathTypeError(TypeError):
+ """ TypeError that provides `path` and `type` attributes tracking expectations. """
+
+ def __init__(self, message, filepath, pathtype):
+ super(PathTypeError, self).__init__(message, filepath, pathtype)
+ self.message = message
+ self.path = filepath
+ self.type = pathtype
+
+
+
+class FileType(argparse.FileType):
+ """Factory for creating file object types
+
+ Instances of FileType are typically passed as type= arguments to the
+ ArgumentParser add_argument() method.
+
+ Keyword Arguments:
+ - mode='r' -- A string indicating how the file is to be opened. Accepts the
+ same values as the builtin open() function.
+ - encoding=None -- The file's encoding. None is treated as per the `codecs`
+ module (as bytes).
+ - errors='strict' -- Error handling as defined in the `codecs` module:
+ 'strict', 'ignore', 'replace', 'xmlcharrefreplace', 'backslashreplace'
+ - bufsize=-1 -- The file's desired buffer size. Accepts the same values as
+ the builtin open() function.
+ """
+
+ def __init__(self, mode='r', encoding=None, errors='strict', bufsize=-1):
+ self._mode = mode
+ self._encoding = encoding
+ self._errors = errors
+ self._bufsize = bufsize
+
+ def __call__(self, f):
+ mode = self._mode
+ enc = self._encoding
+
+ # the special path "-" means sys.std{in,out}
+ if f == '-':
+ if 'r' in mode:
+ f = '/dev/stdin'
+ enc = enc or sys.stdin.encoding or locale.getpreferredencoding().lower()
+ elif 'w' in mode:
+ f = '/dev/stdout'
+ enc = enc or sys.stdout.encoding or locale.getpreferredencoding().lower()
+ else:
+            msg = 'argument "-" with mode %r' % mode
+            raise ValueError(msg)
+
+ # all other paths are used as ... paths
+ try:
+ return codecs.open( f, mode=mode, encoding=enc or None,
+ errors=self._errors, buffering=self._bufsize )
+ except IOError as e:
+            message = "can't open '%s': %s"
+            raise argparse.ArgumentTypeError(message % (f, e))
+
+ def __repr__(self):
+ args = self._mode, self._encoding, self._errors, self._bufsize
+ args_str = ', '.join(repr(arg) for arg in args if arg != -1)
+ return '%s(%s)' % (type(self).__name__, args_str)
+
+
+
+class PathType(object):
+ """ Factory for validating a path and wrapping it as a `path`.
+
+ Keyword Arguments:
+ - base=u'' -- Base path to resolve the passed path from.
+    - mustExist=True -- Validate path exists, raising OSError otherwise.
+ - expand=True -- Expand the path.
+ - abspath=False -- Resolve the absolute path.
+ """
+ base = u''
+ mustExist = True
+ expand = True
+ abspath = False
+
+
+ def __init__(self, base=u'', mustExist=True, expand=True, abspath=False):
+ self.base = path(base)
+ self.mustExist = mustExist
+ self.expand = expand
+ self.abspath = abspath
+
+
+ def checkExists(self, p):
+ if self.mustExist and not p.exists():
+ raise OSError(2, 'No such file or directory', p)
+ return p
+
+ def __call__(self, p):
+ p = self.base/p
+ if self.expand:
+ p = p.expand()
+        if self.abspath:
+ p = p.abspath()
+ return self.checkExists(p)
+
+
+ def __repr__(self):
+ return "%s(%s)" % ( type(self).__name__,
+ ', '.join( '%s=%r' % (k,v) for k,v in self.__dict__.items() if not k[0] == '_' ) )
+
+
+
+class DirectoryType(PathType):
+ """ Factory for validating a directory path and wrapping it as a `path`.
+ """
+ mkdirs = True
+
+
+ def __init__(self, base=u'', mkdirs=True, mustExist=False, expand=True, abspath=False):
+ """ Factory for validating a directory path and wrapping it as a `path`. If a given
+ path is not a directory, TypeError is raised.
+
+ Keyword Arguments:
+ - base=u'' -- Base path to resolve the passed path from.
+ - mkdirs=True -- If directory does not exist, make it and all intermediary
+ directories.
+ - mustExist=False -- Validate directory exists, raising OSError otherwise.
+ - expand=True -- Expand the path.
+ - abspath=False -- Resolve the absolute path.
+ """
+ super(DirectoryType, self).__init__(base, mustExist, expand, abspath)
+ self.mkdirs = mkdirs
+
+
+ def checkExists(self, p):
+ if self.mkdirs and not p.exists():
+ p.makedirs()
+ if p.exists() and not p.isdir():
+ raise PathTypeError('Path is not a directory', p, self)
+        return super(DirectoryType, self).checkExists(p)
+
+
+