CSV parser, supporting transforms.

author dsc <dsc@wikimedia.org>

Mon, 16 Apr 2012 20:02:30 +0000 (13:02 -0700)

committer dsc <dsc@wikimedia.org>

Mon, 16 Apr 2012 20:02:30 +0000 (13:02 -0700)
author dsc <dsc@wikimedia.org>
Mon, 16 Apr 2012 20:02:30 +0000 (13:02 -0700)
committer dsc <dsc@wikimedia.org>
Mon, 16 Apr 2012 20:02:30 +0000 (13:02 -0700)
diff --git a/lib/util/csv.co b/lib/util/csv.co

new file mode 100644 (file)

index 0000000..cdab5ae
--- /dev/null
+++ b/lib/util/csv.co
@@ -0,0 +1,192 @@
+_  = require 'kraken/util/underscore'
+op = require 'kraken/util/op'
+
+
+DASH_PATTERN       = /-/g # every dash in a string (global flag)
+BLANK_LINE_PATTERN = /^(\s*)$/ # a line that is empty or contains only whitespace
+COMMENT_PATTERN    = /\s*(#|\/\/).*$/ # optional whitespace, then `#` or `//`, through end of line
+
+# Parses delimiter-separated text whose first column is a date, and keeps the
+# result both row-oriented (`data`) and column-oriented (`columns`). Per-column
+# stacks of transform functions can be layered on top of the parsed values.
+class CSVData
+    # Baseline settings; the constructor clones these and overlays caller options.
+    # NOTE(review): defaultType, replaceMissing*, replaceNaN* and padRows* are
+    # declared here but never read by the code in this class -- confirm whether
+    # they are consumed elsewhere or are still to be implemented.
+    DEFAULT_OPTIONS :
+        colSep              : ','
+        rowSep              : '\n'
+        defaultType         : 'float'
+        customBars          : false
+        customSep           : ';'
+        errorBars           : false
+        fractions           : false
+        fractionSep         : '/'
+        skipBlankLines      : true
+        blankLinePat        : BLANK_LINE_PATTERN
+        removeCommentedText : true
+        commentPat          : COMMENT_PATTERN
+        replaceMissing      : false
+        replaceMissingValue : 0
+        replaceNaN          : false
+        replaceNaNValue     : 0
+        padRows             : false
+        padRowsValue        : 0
+    
+    options    : {}
+    
+    labels     : []
+    types      : []
+    
+    rawData     : null # String
+    origData    : null # row-oriented (untransformed)
+    data        : null # row-oriented
+    columns     : null # column-oriented (includes date column)
+    dateColumn  : null # only date column
+    dataColumns : null # column-oriented (excludes date column)
+    
+    transforms  : null # Array<Array<Function>> Col -> Stack of Transforms
+    
+    
+    # Constructor. Accepts `(data, opts)` or just `(opts)`; anything that is
+    # neither a string nor an array in the first slot is treated as the options.
+    # Parses immediately when data is given directly or as `opts.data`.
+    # NOTE(review): arrays pass the type check here, but parse() calls `.split`
+    # on its input, so only string data works as written -- confirm intent.
+    (data, opts) ->
+        unless typeof data is 'string' or _.isArray data
+            [opts, data] = [data, null]
+        @options = _.clone(@DEFAULT_OPTIONS) import (opts or {})
+        # for k in @DEFAULT_OPTIONS then this[k] ?= @options[k]
+        @transforms = []
+        @labels  = @options.labels or []
+        @types   = @options.types  or []
+        @parse that if data or @options.data
+    
+    
+    /* * * *  Parsing  * * * */
+    
+    /**
+     * Parses a single cell as a float (NaN when it is not numeric).
+     */
+    parseNumber: (s) ->
+        parseFloat s
+    
+    /**
+     * Splits a cell on `options.customSep` and parses each piece as a number.
+     * (Previously split on the boolean `options.customBars` flag, so the cell
+     * was never actually separated.)
+     */
+    parseHiLo: (s) ->
+        s.split @options.customSep .map @parseNumber, this
+    
+    /**
+     * Splits a cell on `options.fractionSep` and parses each piece as a number.
+     */
+    parseFraction: (s) ->
+        s.split @options.fractionSep .map @parseNumber, this
+    
+    /**
+     * Builds a Date from a string after replacing every dash with a slash.
+     */
+    parseDate: (s) ->
+        new Date s.replace DASH_PATTERN, '/'
+    
+    
+    /**
+     * Parses raw text into `data`, `columns`, `dateColumn` and `dataColumns`,
+     * and snapshots the parsed rows into `origData`.
+     * 
+     * The first non-skipped line supplies the labels unless labels were given
+     * in the options. Every later line with more than one column becomes a row
+     * whose first field is parsed as a date.
+     * 
+     * @param {String} rawData Text to parse; also stored as `this.rawData`.
+     * @returns {this} (an empty array in the degenerate case where splitting
+     *  yields no lines at all).
+     */
+    parse: (@rawData) ->
+        o = @options
+        
+        lines = rawData.split o.rowSep
+        return [] unless lines.length
+        first = lines[0]
+        
+        # Use the default delimiter or fall back to a tab if that makes sense.
+        delim = o.colSep
+        if first.indexOf(delim) is -1 and first.indexOf('\t') >= 0
+            delim = '\t'
+        
+        data = @data = []
+        @columns     = []
+        @dataColumns = []
+        
+        # Later assignments win: fractions takes precedence over customBars.
+        parser = @parseNumber
+        parser = @parseHiLo     if o.customBars
+        parser = @parseFraction if o.fractions
+        
+        # Labels supplied via options mean the text has no header line to consume.
+        hasHeaders = @labels.length > 0
+        for line of lines
+            line .= replace o.commentPat, '' if o.removeCommentedText
+            continue if o.skipBlankLines and (line.length is 0 or o.blankLinePat.test line)
+            
+            cols = line.split delim
+            unless hasHeaders
+                hasHeaders = true
+                @labels = cols.map -> _.strip it
+                continue
+            
+            # A row needs a date plus at least one value.
+            continue unless cols.length > 1
+            date = @parseDate cols.shift()
+            fields = cols.map parser, this
+            if o.errorBars
+                # Group consecutive parsed fields into pairs.
+                fields = fields.reduce do
+                    (acc, v) ->
+                        last = acc[acc.length-1]
+                        unless last and last.length < 2
+                            acc.push last = []
+                        last.push v
+                        acc
+                    []
+            
+            fields.unshift date
+            data.push fields
+            fields.forEach (v, idx) ~>
+                @columns.push [] unless @columns[idx]
+                @columns[idx].push v
+        
+        @origData = _.merge [], @data
+        # Make sure every column has a (possibly empty) transform stack.
+        while @transforms.length < @columns.length
+            @transforms.push []
+        @dateColumn  = @columns[0]
+        @dataColumns = @columns.slice(1)
+        this
+    
+    
+    
+    /* * * *  Data Transformation  * * * */
+    
+    /**
+     * Rebuilds the row-oriented data matrix from the columns.
+     */
+    rebuildData: ->
+        @data = _.zip ...@columns
+        @dateColumn  = @columns[0]
+        @dataColumns = @columns.slice(1)
+        this
+    
+    /**
+     * Rebuilds the column-oriented matrix from the row-oriented data.
+     */
+    rebuildColumns: ->
+        @columns = _.zip ...@data
+        @dateColumn  = @columns[0]
+        @dataColumns = @columns.slice(1)
+        this
+    
+    /**
+     * Map a function across the specified columns, one-by-one (in column-major 
+     * order), replacing the data with the mapped result.
+     * 
+     * May also be called as `addTransform(fn, [ctx])` to target every column.
+     * Data must have been parsed first, since indices are resolved against the
+     * current column count.
+     * 
+     * @param {Number|Array} indices List one or more column indices to map. Negative
+     *  numbers are offset from the end of the columns list.
+     * @param {Function} fn Mapping function of the form:
+     *  `(single_value, row_idx, column) -> new_value`
+     * @param {Object} [ctx=this] Execution context.
+     * @returns {this} 
+     */
+    addTransform: (indices, fn, ctx=this) ->
+        num_cols = @columns.length
+        # Short form: the function arrived in the first slot.
+        if typeof indices is 'function'
+            [ctx, fn, indices] = [fn, indices, null]
+        # The shuffle above can leave ctx unset; restore the documented default.
+        ctx ?= this
+        unless indices?
+            indices = _.range num_cols
+        unless _.isArray indices
+            indices = [indices]
+        # With no columns the modulo below would produce NaN indices.
+        if indices.length and not num_cols
+            throw new Error 'CSVData.addTransform: no columns to transform; parse data first'
+        # Capture the context with the function so applyTransforms can run the
+        # stack later without needing to know each transform's ctx.
+        bound = -> fn.apply ctx, arguments
+        for idx of indices
+            idx %= num_cols
+            idx += num_cols if idx < 0
+            (@transforms[idx] ?= []).push bound
+        @applyTransforms()
+    
+    /**
+     * Adds a transform to every column except the date column (index 0).
+     * 
+     * @param {Function} fn Mapping function, as for `addTransform`.
+     * @param {Object} [ctx=this] Execution context.
+     * @returns {this}
+     */
+    addDataTransform: (fn, ctx=this) ->
+        @addTransform _.range(1, @columns.length), fn, ctx
+    
+    /**
+     * Recomputes `columns` and `data` by running each column's transform stack,
+     * in insertion order, over the untransformed values in `origData`.
+     * Starting from `origData` keeps repeated calls from compounding earlier
+     * transforms.
+     * NOTE(review): relies on `origData` (built with `_.merge`) being a faithful
+     * row-oriented copy, as `clearTransforms` already assumes -- confirm.
+     * 
+     * @returns {this}
+     */
+    applyTransforms: ->
+        rows = @origData or []
+        @columns = _.zip ...rows
+        for fns, idx of @transforms
+            continue unless fns and @columns[idx]
+            for fn of fns
+                @columns[idx] .= map fn
+        @rebuildData()
+    
+    /**
+     * Drops every transform and restores `data` and `columns` from `origData`.
+     * Each column is left with an empty transform stack so `addTransform`
+     * keeps working afterwards.
+     * 
+     * @returns {this}
+     */
+    clearTransforms: ->
+        @data = _.merge [], @origData
+        @rebuildColumns()
+        @transforms = @columns.map -> []
+        this
+    
+
+module.exports = CSVData # the class itself is the module's export
+
+
diff --git a/lib/util/index.co b/lib/util/index.co

index 35cb297..e98d502 100644 (file)
--- a/lib/util/index.co
+++ b/lib/util/index.co
@@ -12,7 +12,8 @@ op        = require 'kraken/util/op'
 backbone  = require 'kraken/util/backbone'
 parser    = require 'kraken/util/parser'
 Cascade   = require 'kraken/util/cascade'
-exports import { root, _, op, backbone, parser, Cascade, }
+CSVData   = require 'kraken/util/csv'
+exports import { root, _, op, backbone, parser, Cascade, CSVData, }
 
 # HashSet   = require 'kraken/util/hashset'
 # BitString = require 'kraken/util/bitstring'
diff --git a/www/modules.yaml b/www/modules.yaml

index 1e965c5..1f46bcb 100644 (file)
--- a/www/modules.yaml
+++ b/www/modules.yaml
@@ -52,11 +52,13 @@ dev:
                 - backbone
                 - parser
                 - cascade
+                - csv
                 - index
             - base:
                 - base-mixin
                 - base-model
                 - base-view
+                - model-cache
                 - cascading-model
                 - index
             - scaffold:
author	dsc <dsc@wikimedia.org>
	Mon, 16 Apr 2012 20:02:30 +0000 (13:02 -0700)
committer	dsc <dsc@wikimedia.org>
	Mon, 16 Apr 2012 20:02:30 +0000 (13:02 -0700)
lib/util/csv.co	[new file with mode: 0644]	patch \| blob
lib/util/index.co		patch \| blob \| history
www/modules.yaml		patch \| blob \| history