From: dsc Date: Wed, 21 Dec 2011 22:38:08 +0000 (-0800) Subject: find_haiku.py script X-Git-Url: http://git.less.ly:3516/?a=commitdiff_plain;h=74e0945c19d0d24c612753893cf2b8790143f2df;p=crisishaiku.git find_haiku.py script --- diff --git a/README.md b/README.md index 01532de..b13379b 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,16 @@ # Financial Crisis Haiku Did you know that the Financial Crisis Inquiry Report increased the US Gross National Haiku Quotient by 1.8%, the largest single increase ever effected by a congressional report? + + +## Features + +Pages: +- **Home**: Best Of / Staff Picks, Longest Chains; Popular, Mentioned on Twitter (etc) +- **Haiku Page**: unique URL per haiku (plus short URL); tags, context, favs, ratings, comments ("share your story"?), sharing (tweet this, share on fb, AddThis), mentions (on twitter/fb) +- **Report**: split out by-chapter; hilited haikus link to Haiku Page; hilited chains; per-line comments? +- **Search**: fulltext of haikus, report by chapter +- **Users**: Favorites, Rate, Comment, Tag; signup required -- connect via FB, Twitter, GitHub, Google +- **Download**: links to zips of the haikus, the report; source on GitHub + + diff --git a/bin/find_haiku.py b/bin/find_haiku.py index d9a7a13..1495746 100755 --- a/bin/find_haiku.py +++ b/bin/find_haiku.py @@ -1,72 +1,147 @@ -import codecs, msgpack, cjson, re, sys -from path import path +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" Processes the Financial Crisis Inquiry Report to find bittersweet haikus. +""" + +__author__ = 'David Schoonover ' +__copyright__ = 'Copyright (c) 2011 David Schoonover. All rights reserved.' 
+__homepage__ = 'http://crisishaiku.com/' +from crisishaiku import __version__, VERSION + +import sys, re, argparse, codecs + from hyphen import Hyphenator +from path import path +import anyjson as json +# json.force_implementation('jsonlib2') + +import crisishaiku + + +# Constants Infinity = float('inf') STRIP_PAT = re.compile(r'[^a-zA-Z\'\-]+') -VAR = path('var') -STATEPATH = VAR/'state.json' -HAIKUSPATH = VAR/'haikus.txt' +VAR_DIR = path('var') +STATE_FILE = 'state.json' +SYLLABLE_FILE = 'syllables.json' +REPORT_FILE = path('data/fcir.txt') -if not VAR.exists(): VAR.makedirs() +OUT_DIR = VAR_DIR +OUTFILE_OVERLAP = 'haikus.txt' +OUTFILE_NO_OVERLAP = 'haikus-no_overlap.txt' +OUTFILE_CHAINS = 'haikus-longest_chains.txt' -DATA = path('data') -BOOK_DATAPATH = DATA/'fcic.txt' -SYLLABLE_DATAPATH = DATA/'syllables.msgpack' -SYLLABLE_CACHE = {} -if SYLLABLE_DATAPATH.exists(): - with SYLLABLE_DATAPATH.open('rb') as f: - SYLLABLE_CACHE = msgpack.load(f) or {} -def saveCache(): - with SYLLABLE_DATAPATH.open('wb') as f: - msgpack.dump(SYLLABLE_CACHE, f) +# Haiku Finder script -class FinancialCrisis(object): - verbose = False - noOverlap = False # whehter haikus can overlap +class HaikuFinder(object): + __doc__ - start_line = 0 - limit = Infinity - haikus = [] # Results (haiku, line_no) - words = None # Cache previous previous 23 pairs: (word, syllables) + # Class vars + hyphenator = Hyphenator('en_US') + cache = {} # Syllable cache - # Counters + + # Setup + start_line = 0 + limit = Infinity + overlap = True + verbose = False + + out_dir = OUT_DIR + var_dir = VAR_DIR + cachefile = VAR_DIR/SYLLABLE_FILE + statefile = VAR_DIR/STATE_FILE + reportfile = REPORT_FILE + + + # State seen_words = 0 seen_lines = 0 + haikus = [] # Results (haiku, line_no) + words = [] # Cache previous previous 23 pairs: (word, syllables) - def __init__(self, start_line=0, limit=Infinity, book=BOOK_DATAPATH, noOverlap=False, verbose=False): - self.start_line = start_line - self.limit = limit + + def 
__init__(self, out_dir=OUT_DIR, start_line=0, limit=Infinity, overlap=True, verbose=False, var_dir=VAR_DIR, reportfile=REPORT_FILE): + self.start_line = start_line + self.limit = limit + self.overlap = overlap + self.verbose = verbose + + self.var_dir = path(var_dir) + if not self.var_dir.exists(): self.var_dir.makedirs() + self.out_dir = path(out_dir) + if not self.out_dir.exists(): self.out_dir.makedirs() + + if outfile is None: + outfile = OUTFILE_OVERLAP if self.overlap else OUTFILE_NO_OVERLAP + self.outfile = self.out_dir/outfile + else: + self.outfile = path(outfile) + + self.reportfile = reportfile + self.cachefile = self.var_dir/SYLLABLE_FILE + self.statefile = self.var_dir/STATE_FILE + self.seen_words = 0 self.seen_lines = 0 - self.noOverlap = noOverlap - self.verbose = verbose - self.book = str(book) - self.hyphenator = Hyphenator('en_US') - self.words = [] self.haikus = [] + self.words = [] + + self.loadCache() + + + + def loadCache(self): + "Load the syllable cache from disk." + if self.cachefile.exists(): + with cachefile.open('rb') as f: + cache = json.load(f) or {} + if cache and not self.__class__.cache: + self.__class__.cache = cache + else: + self.cache = cache + return self + + def saveCache(self): + "Save the syllable cache to disk." + with self.cachefile.open('wb') as f: + json.dump(self.cache, f) + return self + + def save(self, statefile=None): + "Save the search state." + self.saveCache() + if statefile is None: statefile = self.statefile + FIELDS = 'words haikus seen_lines seen_words'.split() + state = { k:v for k, v in self.__dict__.iteritems() if k in FIELDS } + with codecs.open(statefile, 'w', 'utf-8') as f: + json.dump(state, f) + return self def numSyllables(self, word): + "Calculate number of syllables in `word`." word = unicode( STRIP_PAT.subn(u'', word)[0] ).strip() # print '[WORD] %s' % word if not word or len(word) >= 100: return 0 - if word not in SYLLABLE_CACHE: # XXX: zeros? + if word not in self.cache: # XXX: zeros? 
try: - SYLLABLE_CACHE[word] = max(len(self.hyphenator.syllables(word)), 1) + self.cache[word] = max(len(self.hyphenator.syllables(word)), 1) except: print word raise - return SYLLABLE_CACHE[word] + return self.cache[word] def findStanza(self, pairs, goal=7): + "Attempt to find a stanza of `goal` syllables in the given list of `(word, syllables)` pairs." stanza = [] size = 0 for word, syllables in pairs: @@ -79,6 +154,7 @@ class FinancialCrisis(object): return [] def offer(self, word): + "Process the next word." self.seen_words += 1 syllables = self.numSyllables(word) @@ -112,17 +188,25 @@ class FinancialCrisis(object): self.haikus.append( (haiku, self.seen_lines, self.seen_words-offset, self.seen_words) ) - if self.noOverlap: + if not self.overlap: self.words = [] return haiku - def process(self): - print 'Starting Haiku processing on line %s...' % self.seen_lines - with codecs.open(self.book, 'rU', 'utf-8') as f: + + def run(self, reportfile=None, start_line=None, limit=None): + "Process the report." + if reportfile is None: reportfile = self.reportfile + if start_line is None: start_line = self.start_line + if limit is None: limit = self.limit + + print 'Processing %s, starting on line %s...' % (reportfile, start_line) + + start = time.time() + with codecs.open(reportfile, 'rU', 'utf-8') as f: for line_no, line in enumerate(f): - if line_no < self.start_line: continue - if line_no >= self.limit: break + if line_no < start_line: continue + if line_no >= limit: break self.seen_lines += 1 for word in line.split(): @@ -131,32 +215,23 @@ class FinancialCrisis(object): if False and self.seen_lines % 1000 == 0: print '-' * 20 - print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines) + print '\nFound %s haiku in %s lines (%s words) so far, taking %ss...' % (len(self.haikus), self.seen_lines, self.seen_words, time.time()-start) self.printHaiku(self.haikus[-1]) - print 'Done!' + + print 'Done! 
Found %s haiku in %s lines (%s words), taking %ss' % (len(self.haikus), self.seen_lines, self.seen_words, time.time()-start) + return self - def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False): - haiku, linenum, start_words, end_words = info - if header: - wordloc = '' - if wordlocs and start_words and end_words: - wordloc = ' (words %s to %s)' % (start_words, end_words) - outfile.write('On line %s%s:\n' % (linenum, wordloc)) - lines = [ ' '.join(stanza) for stanza in haiku ] - # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ] - for line in lines: - outfile.write(u' {line: ^80}'.format(line=line).rstrip()+'\n') - outfile.write('\n') - - def saveHaikus(self, outpath=HAIKUSPATH): - print 'Saving %s haiku to %s...' % (len(self.haikus), outpath) + def saveHaikus(self, outfile=None, dump_chains=False): + "Write all haikus to `outfile`." + if outfile is None: outfile = self.outfile + print 'Saving %s haiku to %s...' % (len(self.haikus), outfile) last_wc = 0 chains = [] chain = [] - with codecs.open(outpath, 'w', 'utf-8') as out: + with codecs.open(outfile, 'w', 'utf-8') as out: out.write('Found %s haiku...\n\n' % len(self.haikus)) for info in self.haikus: haiku, linenum, start_words, end_words = info @@ -172,31 +247,84 @@ class FinancialCrisis(object): last_wc = end_words self.printHaiku(info, header=header, outfile=out) - print '\nLongest Chains of Haikus:' - chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True) - for (length, chain) in chains[:10]: - print ('- ' * 40) + '\n' - for haiku in chain: - self.printHaiku((haiku, 0, 0, 0), header=False) + if dump_chains: + print '\nLongest Chains of Haikus:' + chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True) + for (length, chain) in chains[:10]: + print ('- ' * 40) + '\n' + for haiku in chain: + self.printHaiku((haiku, 0, 0, 0), header=False) print '- ' * 40 print '\nDone!' 
+ return self + + def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False): + "Print haiku and metadata to `outfile`." + haiku, linenum, start_words, end_words = info + if header: + wordloc = '' + if wordlocs and start_words and end_words: + wordloc = ' (words %s to %s)' % (start_words, end_words) + outfile.write('On line %s%s:\n' % (linenum, wordloc)) + lines = [ ' '.join(stanza) for stanza in haiku ] + # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ] + for line in lines: + outfile.write(u' {line: ^80}'.format(line=line).rstrip()+'\n') + outfile.write('\n') + return self def printHaikus(self): + "Print all haikus to stdout." print '-' * 20 print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines) for info in self.haikus: self.printHaiku(info) print + return self - def save(self, statepath=STATEPATH): - saveCache() - FIELDS = 'words haikus seen_lines seen_words'.split() - state = { k:v for k, v in self.__dict__.iteritems() if k in FIELDS } - with codecs.open(statepath, 'w', 'utf-8') as f: - f.write(cjson.encode(state)) + # Script Arguments + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--version', action='version', version=__version__) + parser.add_argument("-v", "--verbose", action="store_true", default=verbose, + help="Verbose logging.") + + parser.add_argument("-s", "--start-line", type=int, default=start_line, + help="Line in FCIR to start haiku processing. [default: %(default)s]") + parser.add_argument("-l", "--limit", type=int, default=limit, + help="Stop processing after finding this many haiku.") + parser.add_argument("-o", "--overlap", action="store_true", dest="overlap", default=overlap, + help="Allow haiku text to overlap in FCIR. [Default: %(default)s]") + parser.add_argument("-O", "--no-overlap", action="store_false", dest="overlap", + help="Do not allow haiku text to overlap in FCIR. 
[Default: %(default)s]") + + parser.add_argument("-d", "--var-dir", type=path, default=var_dir, + help="Working directory for state files. [default: %(default)s]") + parser.add_argument("--report-file", type=path, dest="reportfile", default=reportfile, + help="Path to Financial Crisis Inquiry Report plaintext file. [default: %(default)s]") + + parser.add_argument("out-dir", nargs='?', type=path, default=out_dir, + help="Directory to write result files. [default: %(default)s]") + + + @classmethod + def parse(cls, *args, **overrides): + parsed = cls.parser.parse_args(args or None) + values = dict(**parsed.__dict__) + values.update(overrides) + return values + + @classmethod + def main(cls, *args, **overrides): + values = cls.parse(*args, **overrides) + app = cls(**values) + return app.run() or 0 + +if __name__ == '__main__': + sys.exit(HaikuFinder.main()) diff --git a/crisishaiku/__init__.py b/crisishaiku/__init__.py index e69de29..66cec86 100644 --- a/crisishaiku/__init__.py +++ b/crisishaiku/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__version__ = '0.0.1' +VERSION = tuple(map(int, __version__.split('.'))) + diff --git a/crisishaiku/cli/__init__.py b/crisishaiku/cli/__init__.py new file mode 100644 index 0000000..3a1213c --- /dev/null +++ b/crisishaiku/cli/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + diff --git a/crisishaiku/cli/pathtype.py b/crisishaiku/cli/pathtype.py new file mode 100644 index 0000000..9ff03e5 --- /dev/null +++ b/crisishaiku/cli/pathtype.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import sys, codecs, locale +import argparse +from path import path + + +__all__ = ('FileType', 'PathType', 'DirectoryType', 'PathTypeError',) + + +class PathTypeError(TypeError): + """ TypeError that provides `path` and `type` attributes tracking expectations. 
""" + + def __init__(self, message, filepath, pathtype): + super(PathTypeError, self).__init__(message, filepath, pathtype) + self.message = message + self.path = filepath + self.type = pathtype + + + +class FileType(argparse.FileType): + """Factory for creating file object types + + Instances of FileType are typically passed as type= arguments to the + ArgumentParser add_argument() method. + + Keyword Arguments: + - mode='r' -- A string indicating how the file is to be opened. Accepts the + same values as the builtin open() function. + - encoding=None -- The file's encoding. None is treated as per the `codecs` + module (as bytes). + - errors='strict' -- Error handling as defined in the `codecs` module: + 'strict', 'ignore', 'replace', 'xmlcharrefreplace', 'backslashreplace' + - bufsize=-1 -- The file's desired buffer size. Accepts the same values as + the builtin open() function. + """ + + def __init__(self, mode='r', encoding=None, errors='strict', bufsize=-1): + self._mode = mode + self._encoding = encoding + self._errors = errors + self._bufsize = bufsize + + def __call__(self, f): + mode = self._mode + enc = self._encoding + + # the special path "-" means sys.std{in,out} + if f == '-': + if 'r' in mode: + f = '/dev/stdin' + enc = enc or sys.stdin.encoding or locale.getpreferredencoding().lower() + elif 'w' in mode: + f = '/dev/stdout' + enc = enc or sys.stdout.encoding or locale.getpreferredencoding().lower() + else: + msg = _('argument "-" with mode %r') % mode + raise ValueError(msg) + + # all other paths are used as ... 
paths + try: + return codecs.open( f, mode=mode, encoding=enc or None, + errors=self._errors, buffering=self._bufsize ) + except IOError as e: + message = _("can't open '%s': %s") + raise ArgumentTypeError(message % (f, e)) + + def __repr__(self): + args = self._mode, self._encoding, self._errors, self._bufsize + args_str = ', '.join(repr(arg) for arg in args if arg != -1) + return '%s(%s)' % (type(self).__name__, args_str) + + + +class PathType(object): + """ Factory for validating a path and wrapping it as a `path`. + + Keyword Arguments: + - base=u'' -- Base path to resolve the passed path from. + - mustExist=False -- Validate directory exists, raising OSError otherwise. + - expand=True -- Expand the path. + - abspath=False -- Resolve the absolute path. + """ + base = u'' + mustExist = True + expand = True + abspath = False + + + def __init__(self, base=u'', mustExist=True, expand=True, abspath=False): + self.base = path(base) + self.mustExist = mustExist + self.expand = expand + self.abspath = abspath + + + def checkExists(self, p): + if self.mustExist and not p.exists(): + raise OSError(2, 'No such file or directory', p) + return p + + def __call__(self, p): + p = self.base/p + if self.expand: + p = p.expand() + if self.abspath(): + p = p.abspath() + return self.checkExists(p) + + + def __repr__(self): + return "%s(%s)" % ( type(self).__name__, + ', '.join( '%s=%r' % (k,v) for k,v in self.__dict__.items() if not k[0] == '_' ) ) + + + +class DirectoryType(PathType): + """ Factory for validating a directory path and wrapping it as a `path`. + """ + mkdirs = True + + + def __init__(self, base=u'', mkdirs=True, mustExist=False, expand=True, abspath=False): + """ Factory for validating a directory path and wrapping it as a `path`. If a given + path is not a directory, TypeError is raised. + + Keyword Arguments: + - base=u'' -- Base path to resolve the passed path from. + - mkdirs=True -- If directory does not exist, make it and all intermediary + directories. 
+ - mustExist=False -- Validate directory exists, raising OSError otherwise. + - expand=True -- Expand the path. + - abspath=False -- Resolve the absolute path. + """ + super(DirectoryType, self).__init__(base, mustExist, expand, abspath) + self.mkdirs = mkdirs + + + def checkExists(self, p): + if self.mkdirs and not p.exists(): + p.makedirs() + if p.exists() and not p.isdir(): + raise PathTypeError('Path is not a directory', p, self) + return super(PathType, self).checkExists(p) + + + diff --git a/data/fcic.txt b/data/fcir.txt similarity index 100% rename from data/fcic.txt rename to data/fcir.txt diff --git a/setup.py b/setup.py index ddd0fb4..f32cc68 100644 --- a/setup.py +++ b/setup.py @@ -23,8 +23,11 @@ setup( # entry_points = { 'console_scripts':['crisishaiku = crisishaiku:CrisisHaiku.main'] }, install_requires = [ - "bunch >= 1.0", - "PyYAML >= 3.10", + 'bunch >= 1.0', + 'PyYAML >= 3.10', + 'jsonlib2 >= 1.5.2', + 'anyjson >= 0.3.1', + 'PyHyphen >= 1.0beta1', ], keywords = 'crisishaiku crisis haiku poetry web',