From: dsc <david.schoonover@gmail.com>
Date: Wed, 21 Dec 2011 22:38:08 +0000 (-0800)
Subject: find_haiku.py script
X-Git-Url: http://git.less.ly:3516/?a=commitdiff_plain;h=74e0945c19d0d24c612753893cf2b8790143f2df;p=crisishaiku.git

find_haiku.py script
---

diff --git a/README.md b/README.md
index 01532de..b13379b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,16 @@
 # Financial Crisis Haiku
 
 Did you know that the Financial Crisis Inquiry Report increased the US Gross National Haiku Quotient by 1.8%, the largest single increase every affected by a congressional report?
+
+
+## Features
+
+Pages:
+- **Home**: Best Of / Staff Picks, Longest Chains; Popular, Mentioned on Twitter (etc)
+- **Haiku Page**: unique URL per haiku (plus short URL); tags, context, favs, ratings, comments ("share your story"?), sharing (tweet this, share on fb, AddThis), mentions (on twitter/fb)
+- **Report**: split out by-chapter; highlighted haikus link to Haiku Page; highlighted chains; per-line comments?
+- **Search**: fulltext of haikus, report by chapter
+- **Users**: Favorites, Rate, Comment, Tag; signup required -- connect via FB, Twitter, GitHub, Google
+- **Download**: links to zips of the haikus, the report; source on GitHub
+
+
diff --git a/bin/find_haiku.py b/bin/find_haiku.py
index d9a7a13..1495746 100755
--- a/bin/find_haiku.py
+++ b/bin/find_haiku.py
@@ -1,72 +1,147 @@
-import codecs, msgpack, cjson, re, sys
-from path import path
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+""" Processes the Financial Crisis Inquiry Report to find bittersweet haikus.
+"""
+
+__author__    = 'David Schoonover <dsc@less.ly>'
+__copyright__ = 'Copyright (c) 2011 David Schoonover. All rights reserved.'
+__homepage__  = 'http://crisishaiku.com/'
+from crisishaiku import __version__, VERSION
+
+import sys, re, argparse, codecs, time
+
 from hyphen import Hyphenator
+from path import path
+import anyjson as json
+# json.force_implementation('jsonlib2')
+
+import crisishaiku
+
+
+# Constants
 
 Infinity = float('inf')
 
 STRIP_PAT = re.compile(r'[^a-zA-Z\'\-]+')
 
-VAR               = path('var')
-STATEPATH         = VAR/'state.json'
-HAIKUSPATH        = VAR/'haikus.txt'
+VAR_DIR            = path('var')
+STATE_FILE         = 'state.json'
+SYLLABLE_FILE      = 'syllables.json'
+REPORT_FILE        = path('data/fcir.txt')
 
-if not VAR.exists(): VAR.makedirs()
+OUT_DIR            = VAR_DIR
+OUTFILE_OVERLAP    = 'haikus.txt'
+OUTFILE_NO_OVERLAP = 'haikus-no_overlap.txt'
+OUTFILE_CHAINS     = 'haikus-longest_chains.txt'
 
-DATA              = path('data')
-BOOK_DATAPATH     = DATA/'fcic.txt'
-SYLLABLE_DATAPATH = DATA/'syllables.msgpack'
-SYLLABLE_CACHE    = {}
 
-if SYLLABLE_DATAPATH.exists():
-    with SYLLABLE_DATAPATH.open('rb') as f:
-        SYLLABLE_CACHE = msgpack.load(f) or {}
 
-def saveCache():
-    with SYLLABLE_DATAPATH.open('wb') as f:
-        msgpack.dump(SYLLABLE_CACHE, f)
 
+# Haiku Finder script
 
-class FinancialCrisis(object):
-    verbose    = False
-    noOverlap  = False  # whehter haikus can overlap
+class HaikuFinder(object):
+    __doc__
     
-    start_line = 0
-    limit      = Infinity
-    haikus     = []     # Results (haiku, line_no)
-    words      = None   # Cache previous previous 23 pairs: (word, syllables)
+    # Class vars
+    hyphenator = Hyphenator('en_US')
+    cache      = {}     # Syllable cache
     
-    # Counters
+    
+    # Setup
+    start_line  = 0
+    limit       = Infinity
+    overlap     = True
+    verbose     = False
+    
+    out_dir     = OUT_DIR
+    var_dir     = VAR_DIR
+    cachefile   = VAR_DIR/SYLLABLE_FILE
+    statefile   = VAR_DIR/STATE_FILE
+    reportfile  = REPORT_FILE
+    
+    
+    # State
     seen_words = 0
     seen_lines = 0
+    haikus     = []     # Results (haiku, line_no)
+    words      = []     # Cache previous previous 23 pairs: (word, syllables)
     
     
-    def __init__(self, start_line=0, limit=Infinity, book=BOOK_DATAPATH, noOverlap=False, verbose=False):
-        self.start_line = start_line
-        self.limit      = limit
+    
+    def __init__(self, out_dir=OUT_DIR, start_line=0, limit=Infinity, overlap=True, verbose=False, var_dir=VAR_DIR, reportfile=REPORT_FILE, outfile=None):
+        self.start_line  = start_line
+        self.limit       = limit
+        self.overlap     = overlap
+        self.verbose     = verbose
+        
+        self.var_dir     = path(var_dir)    # working dir for cache/state files; created if missing
+        if not self.var_dir.exists(): self.var_dir.makedirs()
+        self.out_dir     = path(out_dir)    # result dir; created if missing
+        if not self.out_dir.exists(): self.out_dir.makedirs()
+        
+        if outfile is None:                 # no explicit outfile: pick a default name by overlap mode
+            outfile = OUTFILE_OVERLAP if self.overlap else OUTFILE_NO_OVERLAP
+            self.outfile = self.out_dir/outfile
+        else:                               # explicit outfile is used as given, not joined to out_dir
+            self.outfile = path(outfile)
+        
+        self.reportfile  = reportfile
+        self.cachefile   = self.var_dir/SYLLABLE_FILE
+        self.statefile   = self.var_dir/STATE_FILE
+        
         self.seen_words = 0
         self.seen_lines = 0
-        self.noOverlap  = noOverlap
-        self.verbose    = verbose
-        self.book       = str(book)
-        self.hyphenator = Hyphenator('en_US')
-        self.words      = []
         self.haikus     = []
+        self.words      = []
+        
+        self.loadCache()
+    
+    
+    
+    def loadCache(self):
+        "Load the syllable cache from `self.cachefile` (JSON), if it exists; returns self."
+        if self.cachefile.exists():
+            with self.cachefile.open('rb') as f:
+                cache = json.loads(f.read()) or {}  # anyjson offers loads/dumps, not file-object load/dump
+            if cache and not self.__class__.cache:  # first non-empty load seeds the class-wide cache
+                self.__class__.cache = cache
+            else:                                   # otherwise keep it on this instance only
+                self.cache = cache
+        return self
+    
+    def saveCache(self):
+        "Save the syllable cache to `self.cachefile` as JSON; returns self."
+        with self.cachefile.open('wb') as f:
+            f.write(json.dumps(self.cache))  # anyjson offers dumps, not file-object dump
+        return self
+    
+    def save(self, statefile=None):
+        "Save the syllable cache, then the search state as JSON to `statefile` (default: `self.statefile`); returns self."
+        self.saveCache()
+        if statefile is None: statefile = self.statefile
+        FIELDS = 'words haikus seen_lines seen_words'.split()
+        state = { k:v for k, v in self.__dict__.iteritems() if k in FIELDS }  # instance attributes only
+        with codecs.open(statefile, 'w', 'utf-8') as f:
+            f.write(json.dumps(state))  # anyjson offers dumps, not file-object dump
+        return self
     
     
     def numSyllables(self, word):
+        "Calculate number of syllables in `word`."
         word = unicode( STRIP_PAT.subn(u'', word)[0] ).strip()
         # print '[WORD] %s' % word
         if not word or len(word) >= 100:
             return 0
-        if word not in SYLLABLE_CACHE: # XXX: zeros?
+        if word not in self.cache: # XXX: zeros?
             try:
-                SYLLABLE_CACHE[word] = max(len(self.hyphenator.syllables(word)), 1)
+                self.cache[word] = max(len(self.hyphenator.syllables(word)), 1)
             except:
                 print word
                 raise
-        return SYLLABLE_CACHE[word]
+        return self.cache[word]
     
     def findStanza(self, pairs, goal=7):
+        "Attempt to find a stanza of `goal` syllables in the given list of `(word, syllables)` pairs."
         stanza = []
         size = 0
         for word, syllables in pairs:
@@ -79,6 +154,7 @@ class FinancialCrisis(object):
         return []
     
     def offer(self, word):
+        "Process the next word."
         self.seen_words += 1
         
         syllables = self.numSyllables(word)
@@ -112,17 +188,25 @@ class FinancialCrisis(object):
         self.haikus.append(
             (haiku, self.seen_lines, self.seen_words-offset, self.seen_words) )
         
-        if self.noOverlap:
+        if not self.overlap:
             self.words = []
         
         return haiku
     
-    def process(self):
-        print 'Starting Haiku processing on line %s...' % self.seen_lines
-        with codecs.open(self.book, 'rU', 'utf-8') as f:
+    
+    def run(self, reportfile=None, start_line=None, limit=None):
+        "Process the report."
+        if reportfile is None: reportfile = self.reportfile
+        if start_line is None: start_line = self.start_line
+        if limit      is None: limit      = self.limit
+        
+        print 'Processing %s, starting on line %s...' % (reportfile, start_line)
+        
+        start = time.time()
+        with codecs.open(reportfile, 'rU', 'utf-8') as f:
             for line_no, line in enumerate(f):
-                if line_no < self.start_line:   continue
-                if line_no >= self.limit:       break
+                if line_no < start_line:   continue
+                if line_no >= limit:       break
                 
                 self.seen_lines += 1
                 for word in line.split():
@@ -131,32 +215,23 @@ class FinancialCrisis(object):
                 
                 if False and self.seen_lines % 1000 == 0:
                     print '-' * 20
-                    print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
+                    print '\nFound %s haiku in %s lines (%s words) so far, taking %ss...' % (len(self.haikus), self.seen_lines, self.seen_words, time.time()-start)
                     self.printHaiku(self.haikus[-1])
-        print 'Done!'
+        
+        print 'Done!  Found %s haiku in %s lines (%s words), taking %ss' % (len(self.haikus), self.seen_lines, self.seen_words, time.time()-start)
+        return self
     
     
-    def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False):
-        haiku, linenum, start_words, end_words = info
-        if header:
-            wordloc = ''
-            if wordlocs and start_words and end_words:
-                wordloc = ' (words %s to %s)' % (start_words, end_words)
-            outfile.write('On line %s%s:\n' % (linenum, wordloc))
-        lines = [ ' '.join(stanza) for stanza in haiku ]
-        # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ]
-        for line in lines:
-            outfile.write(u'    {line: ^80}'.format(line=line).rstrip()+'\n')
-        outfile.write('\n')
-    
-    def saveHaikus(self, outpath=HAIKUSPATH):
-        print 'Saving %s haiku to %s...' % (len(self.haikus), outpath)
+    def saveHaikus(self, outfile=None, dump_chains=False):
+        "Write all haikus to `outfile`."
+        if outfile is None: outfile = self.outfile
+        print 'Saving %s haiku to %s...' % (len(self.haikus), outfile)
         
         last_wc = 0
         chains = []
         chain = []
         
-        with codecs.open(outpath, 'w', 'utf-8') as out:
+        with codecs.open(outfile, 'w', 'utf-8') as out:
             out.write('Found %s haiku...\n\n' % len(self.haikus))
             for info in self.haikus:
                 haiku, linenum, start_words, end_words = info
@@ -172,31 +247,84 @@ class FinancialCrisis(object):
                 last_wc = end_words
                 self.printHaiku(info, header=header, outfile=out)
         
-        print '\nLongest Chains of Haikus:'
-        chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True)
-        for (length, chain) in chains[:10]:
-            print ('- ' * 40) + '\n'
-            for haiku in chain:
-                self.printHaiku((haiku, 0, 0, 0), header=False)
+        if dump_chains:
+            print '\nLongest Chains of Haikus:'
+            chains = sorted([ (len(chain), chain) for chain in chains ], reverse=True)
+            for (length, chain) in chains[:10]:
+                print ('- ' * 40) + '\n'
+                for haiku in chain:
+                    self.printHaiku((haiku, 0, 0, 0), header=False)
         
         print '- ' * 40
         print '\nDone!'
+        return self
+    
+    def printHaiku(self, info, outfile=sys.stdout, header=True, wordlocs=False):
+        "Print haiku and metadata to `outfile`."
+        haiku, linenum, start_words, end_words = info
+        if header:
+            wordloc = ''
+            if wordlocs and start_words and end_words:
+                wordloc = ' (words %s to %s)' % (start_words, end_words)
+            outfile.write('On line %s%s:\n' % (linenum, wordloc))
+        lines = [ ' '.join(stanza) for stanza in haiku ]
+        # lines = [ ' '.join( '%s[%s]' % (word, self.numSyllables(word)) for word in stanza ) for stanza in haiku ]
+        for line in lines:
+            outfile.write(u'    {line: ^80}'.format(line=line).rstrip()+'\n')
+        outfile.write('\n')
+        return self
     
     def printHaikus(self):
+        "Print all haikus to stdout."
         print '-' * 20
         print '\nFound %s haiku so far (line %s)...' % (len(self.haikus), self.seen_lines)
         for info in self.haikus:
             self.printHaiku(info)
         print
+        return self
     
-    def save(self, statepath=STATEPATH):
-        saveCache()
-        FIELDS = 'words haikus seen_lines seen_words'.split()
-        state = { k:v for k, v in self.__dict__.iteritems() if k in FIELDS }
-        with codecs.open(statepath, 'w', 'utf-8') as f:
-            f.write(cjson.encode(state))
     
     
+    # Script Arguments
+    
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--version', action='version', version=__version__)
+    parser.add_argument("-v", "--verbose", action="store_true", default=verbose,
+        help="Verbose logging.")
+    
+    parser.add_argument("-s", "--start-line", type=int, default=start_line,
+        help="Line in FCIR to start haiku processing. [default: %(default)s]")
+    parser.add_argument("-l", "--limit", type=int, default=limit,
+        help="Line in FCIR at which to stop haiku processing. [default: %(default)s]")
+    parser.add_argument("-o", "--overlap",    action="store_true",  dest="overlap", default=overlap,
+        help="Allow haiku text to overlap in FCIR. [Default: %(default)s]")
+    parser.add_argument("-O", "--no-overlap", action="store_false", dest="overlap",
+        help="Do not allow haiku text to overlap in FCIR. [Default: %(default)s]")
+    
+    parser.add_argument("-d", "--var-dir", type=path, default=var_dir,
+        help="Working directory for state files. [default: %(default)s]")
+    parser.add_argument("--report-file", type=path, dest="reportfile", default=reportfile,
+        help="Path to Financial Crisis Inquiry Report plaintext file. [default: %(default)s]")
+    
+    parser.add_argument("out_dir", metavar="out-dir", nargs='?', type=path, default=out_dir,  # positional name is the dest; must match __init__'s out_dir kwarg
+        help="Directory to write result files. [default: %(default)s]")
+    
+    
+    @classmethod
+    def parse(cls, *args, **overrides):  # returns a dict of parsed option values
+        parsed = cls.parser.parse_args(args or None)  # no args given: argparse falls back to sys.argv[1:]
+        values = dict(**parsed.__dict__)  # copy, so the Namespace itself is not mutated
+        values.update(overrides)  # explicit overrides win over parsed CLI values
+        return values
+    
+    @classmethod
+    def main(cls, *args, **overrides):  # script entry point; returns the process exit status
+        values = cls.parse(*args, **overrides)
+        cls(**values).run()  # run() returns the instance, which must not be handed to sys.exit()
+        return 0
+    
 
 
+if __name__ == '__main__':
+    sys.exit(HaikuFinder.main())
 
diff --git a/crisishaiku/__init__.py b/crisishaiku/__init__.py
index e69de29..66cec86 100644
--- a/crisishaiku/__init__.py
+++ b/crisishaiku/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__version__ = '0.0.1'
+VERSION = tuple(map(int, __version__.split('.')))
+
diff --git a/crisishaiku/cli/__init__.py b/crisishaiku/cli/__init__.py
new file mode 100644
index 0000000..3a1213c
--- /dev/null
+++ b/crisishaiku/cli/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
diff --git a/crisishaiku/cli/pathtype.py b/crisishaiku/cli/pathtype.py
new file mode 100644
index 0000000..9ff03e5
--- /dev/null
+++ b/crisishaiku/cli/pathtype.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys, codecs, locale
+import argparse
+from path import path
+
+
+__all__ = ('FileType', 'PathType', 'DirectoryType', 'PathTypeError',)
+
+
+class PathTypeError(TypeError):
+    """ TypeError that also records `message`, the offending `path`, and the expected `type`. """
+    
+    def __init__(self, message, filepath, pathtype):
+        super(PathTypeError, self).__init__(message, filepath, pathtype)
+        self.message = message
+        self.path    = filepath
+        self.type    = pathtype
+
+
+
+class FileType(argparse.FileType):
+    """Factory for creating file object types
+    
+    Instances of FileType are typically passed as type= arguments to the
+    ArgumentParser add_argument() method.
+    
+    Keyword Arguments:
+        - mode='r' -- A string indicating how the file is to be opened. Accepts the
+            same values as the builtin open() function.
+        - encoding=None -- The file's encoding. None is treated as per the `codecs`
+            module (as bytes).
+        - errors='strict' -- Error handling as defined in the `codecs` module:
+            'strict', 'ignore', 'replace', 'xmlcharrefreplace', 'backslashreplace'
+        - bufsize=-1 -- The file's desired buffer size. Accepts the same values as
+            the builtin open() function.
+    """
+    
+    def __init__(self, mode='r', encoding=None, errors='strict', bufsize=-1):
+        self._mode     = mode
+        self._encoding = encoding
+        self._errors   = errors
+        self._bufsize  = bufsize
+    
+    def __call__(self, f):  # opens `f`; raises ArgumentTypeError on IOError, ValueError for "-" with an unsupported mode
+        mode = self._mode
+        enc = self._encoding
+        
+        # the special path "-" means sys.std{in,out}
+        if f == '-':
+            if 'r' in mode:
+                f = '/dev/stdin'
+                enc = enc or sys.stdin.encoding or locale.getpreferredencoding().lower()
+            elif 'w' in mode:
+                f = '/dev/stdout'
+                enc = enc or sys.stdout.encoding or locale.getpreferredencoding().lower()
+            else:
+                msg = 'argument "-" with mode %r' % mode  # no gettext `_` is imported in this module
+                raise ValueError(msg)
+        
+        # all other paths are used as ... paths
+        try:
+            return codecs.open( f, mode=mode, encoding=enc or None,
+                errors=self._errors, buffering=self._bufsize )
+        except IOError as e:
+            message = "can't open '%s': %s"
+            raise argparse.ArgumentTypeError(message % (f, e))  # qualified: only `argparse` itself is imported
+    
+    def __repr__(self):
+        args = self._mode, self._encoding, self._errors, self._bufsize
+        args_str = ', '.join(repr(arg) for arg in args if arg != -1)
+        return '%s(%s)' % (type(self).__name__, args_str)
+
+
+
+class PathType(object):
+    """ Factory for validating a path and wrapping it as a `path`.
+        
+        Keyword Arguments:
+            - base=u'' -- Base path to resolve the passed path from.
+            - mustExist=True -- Validate the path exists, raising OSError otherwise.
+            - expand=True -- Expand the path.
+            - abspath=False -- Resolve the absolute path.
+    """
+    base      = u''
+    mustExist = True
+    expand    = True
+    abspath   = False
+    
+    
+    def __init__(self, base=u'', mustExist=True, expand=True, abspath=False):
+        self.base      = path(base)
+        self.mustExist = mustExist
+        self.expand    = expand
+        self.abspath   = abspath
+    
+    
+    def checkExists(self, p):  # returns `p`; raises OSError(2) when mustExist is set and `p` is missing
+        if self.mustExist and not p.exists():
+            raise OSError(2, 'No such file or directory', p)
+        return p
+    
+    def __call__(self, p):  # resolve `p` against base, optionally expand/absolutize, then validate
+        p = self.base/p
+        if self.expand:
+            p = p.expand()
+        if self.abspath:  # a boolean flag, not a callable
+            p = p.abspath()
+        return self.checkExists(p)
+    
+    
+    def __repr__(self):
+        return "%s(%s)" % ( type(self).__name__, 
+            ', '.join( '%s=%r' % (k,v) for k,v in self.__dict__.items() if not k[0] == '_' ) )
+    
+
+
+class DirectoryType(PathType):
+    """ Factory for validating a directory path and wrapping it as a `path`.
+    """
+    mkdirs = True
+    
+    
+    def __init__(self, base=u'', mkdirs=True, mustExist=False, expand=True, abspath=False):
+        """ Factory for validating a directory path and wrapping it as a `path`. If a given
+            path is not a directory, TypeError is raised.
+            
+            Keyword Arguments:
+                - base=u'' -- Base path to resolve the passed path from.
+                - mkdirs=True -- If directory does not exist, make it and all intermediary
+                    directories.
+                - mustExist=False -- Validate directory exists, raising OSError otherwise.
+                - expand=True -- Expand the path.
+                - abspath=False -- Resolve the absolute path.
+        """
+        super(DirectoryType, self).__init__(base, mustExist, expand, abspath)
+        self.mkdirs = mkdirs
+    
+    
+    def checkExists(self, p):  # create if requested, reject non-directories, then apply the base existence check
+        if self.mkdirs and not p.exists():
+            p.makedirs()
+        if p.exists() and not p.isdir():
+            raise PathTypeError('Path is not a directory', p, self)
+        return super(DirectoryType, self).checkExists(p)  # must name this class so PathType.checkExists is reached
+    
+
+
diff --git a/data/fcic.txt b/data/fcir.txt
similarity index 100%
rename from data/fcic.txt
rename to data/fcir.txt
diff --git a/setup.py b/setup.py
index ddd0fb4..f32cc68 100644
--- a/setup.py
+++ b/setup.py
@@ -23,8 +23,11 @@ setup(
     # entry_points     = { 'console_scripts':['crisishaiku = crisishaiku:CrisisHaiku.main'] },
     
     install_requires = [
-        "bunch  >= 1.0",
-        "PyYAML >= 3.10",
+        'bunch    >= 1.0',
+        'PyYAML   >= 3.10',
+        'jsonlib2 >= 1.5.2',
+        'anyjson  >= 0.3.1',
+        'PyHyphen >= 1.0beta1',
     ],
     
     keywords         = 'crisishaiku crisis haiku poetry web',