--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
# NOTE: keep this assignment in the exact `__version__ = '<x.y.z>'` form;
# the packaging script extracts the version from this line with a regex.
__version__ = '0.1.0'

# Numeric form of the version, e.g. (0, 1, 0), convenient for comparisons.
VERSION = tuple(int(part) for part in __version__.split('.'))
+
+import sys, re, fileinput, argparse
+from datetime import datetime
+
+### Example from logs:
+# 91.121.211.71 - - [02/May/2012:07:20:28 +0000] "GET / HTTP/1.1" 200 1600 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 1.1.4322)"
+
+### (nginx) There is a predefined log format called "combined":
+# log_format combined '$remote_addr - $remote_user [$time_local] '
+# '"$request" $status $body_bytes_sent '
+# '"$http_referer" "$http_user_agent"';
+
# Regex fragment for each supported log-format variable.  Each fragment is a
# single capturing group here; further down every entry is wrapped in a named
# group so that a match exposes the fields through `groupdict()`.
PATTERNS = {
    'ip'         : r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
    'user'       : r'([^\s]*)',
    'time'       : r'([^\]]+?)',
    'request'    : r'([A-Z]+)[ \t]+([^\s"]+)[ \t]+([^"]*)',
    'status'     : r'(\d{2,})',
    'bytes'      : r'(\d+)',
    # `*` rather than `+`: a quoted field may legitimately be empty (`""`),
    # and requiring one character would make the whole line fail to match.
    'referrer'   : r'([^"]*)',
    'user_agent' : r'([^"]*)',
}

# Fields whose captured text is converted to a `datetime` after matching.
DATE_KEYS = ['time']

# nginx variable names (and common spellings) mapped onto the canonical
# pattern names above.
ALIASES = {
    'remote_addr'     : 'ip',
    'remote_user'     : 'user',
    'time_local'      : 'time',
    'body_bytes_sent' : 'bytes',
    'http_referer'    : 'referrer',
    'referer'         : 'referrer',
    'http_user_agent' : 'user_agent',
}
for alias, target in ALIASES.items():
    PATTERNS[alias] = PATTERNS[target]
    # An alias of a date field is itself a date field.
    if target in DATE_KEYS:
        DATE_KEYS.append(alias)

# Matches a trailing numeric UTC offset such as " +0000" so it can be removed
# before the timestamp is handed to strptime (presumably because `%z` parsing
# is not available on every Python version this targets -- TODO confirm).
TZ_PAT = re.compile(r'[ \t]+[\+\-]\d{4}[ \t]*$')

# And finally build everything to named regex captures, keyed by the token
# name that appears in the format string (aliases keep their own name).
PATTERNS = dict( (k, '(?P<%s>%s)' % (k, pat)) for k, pat in PATTERNS.items() )


# Tokenizer for resolving nginx format: splits a format string into
# (literal text before the first `$`, variable name, remainder).
TOKENIZER = re.compile(r'([^\$]*)\$([a-zA-Z_]+)(.*)')
+
class NginxLogParser(object):
    """Parse an nginx log file line by line.

    An nginx-style `log_format` string (literal text interleaved with
    `$variable` tokens) is compiled once into a regular expression; `parse()`
    then applies it to individual log lines.
    """
    # Default format, using the token names understood by PATTERNS.
    log_format  = '$ip - $user [$time] "$request" $status $bytes "$referer" "$user_agent"'
    # `%H` (24-hour clock), not `%I`: with `%I` any timestamp whose hour is
    # 00 or 13-23 makes strptime raise ValueError.
    date_format = '%d/%b/%Y:%H:%M:%S'


    def __init__(self, log_format=None, date_format=None):
        """Compile the parser.

        log_format  -- nginx-style format string; defaults to the class-level
                       `log_format`.
        date_format -- strptime format for date fields (after the trailing UTC
                       offset has been stripped); defaults to the class-level
                       `date_format`.
        """
        self.log_format      = log_format or self.log_format
        self.date_format     = date_format or self.date_format
        self.compiled_format = re.compile( self.formatToRegex(self.log_format) )

    def formatToRegex(self, format):
        """Convert an nginx-style formatting string into a regex.

        Literal text is regex-escaped, each run of spaces/tabs is loosened to
        match any run of spaces/tabs, and every `$token` is replaced by its
        entry in PATTERNS.  A token with no PATTERNS entry is inserted as its
        bare name (without the `$`).
        """
        resolved = ''
        while format:
            m = TOKENIZER.match(format)
            if m:
                raw, token, format = m.groups()
            else:
                # No `$token` left: the remainder is pure literal text.
                raw, token, format = format, '', ''
            # Escape each non-blank run and rejoin the runs with a pattern
            # that accepts any amount of horizontal whitespace.
            literal = r'[ \t]+'.join( re.escape(run) for run in re.split(r'[ \t]+', raw) )
            resolved += literal + PATTERNS.get(token, token)
        return resolved

    def parse(self, line):
        """Parse a logline and return the dict of fields.

        Returns an empty dict when the line does not match the format.  Fields
        listed in DATE_KEYS are converted to `datetime` objects; the trailing
        UTC offset is discarded first, so the result is a naive datetime.
        Raises ValueError (from strptime) if a date field does not match
        `date_format`.
        """
        m = self.compiled_format.match(line)
        data = m.groupdict() if m else {}
        for k in DATE_KEYS:
            if k not in data:
                continue
            stamp = TZ_PAT.sub('', data[k])
            data[k] = datetime.strptime(stamp, self.date_format)
        return data
+
+
+
class NginxToCsvScript(object):
    "Parse nginx logs into a CSV file."
    # NOTE: the docstring above doubles as the --help description (see
    # `description=__doc__` below), so it is kept short and user-facing.

    def __init__(self, **args):
        # Every option becomes an instance attribute (log_format, logfiles, ...);
        # the raw mapping is kept as well.
        self.__dict__.update(**args)
        self.__args__ = args

    def __call__(self):
        # Run the script: parse every non-blank input line into a dict of
        # fields and collect the results on `self.data`.
        #
        # NOTE(review): the parsed rows are only stored on `self.data`; nothing
        # is written to `outfile` and the `json` flag is not consulted here.
        #
        # Forward the date format too, so `-d/--date_format` takes effect.
        # `getattr` keeps instances built without that option working.
        self.logParser = NginxLogParser(self.log_format, getattr(self, 'date_format', None))
        stripped = ( line.rstrip() for line in fileinput.input(self.logfiles) )
        self.data = [ self.logParser.parse(line) for line in stripped if line ]

    parser = argparse.ArgumentParser(description=__doc__)
    # `__version__` is already a dotted string; joining its characters with
    # '.' would print "0...1...0".
    parser.add_argument('--version', action='version', version=__version__)
    parser.add_argument('-f', '--log_format', default=NginxLogParser.log_format,
        help='Log formatting string to parse. [default: %(default)s]')
    parser.add_argument('-d', '--date_format', default=NginxLogParser.date_format,
        help='strptime date-formatting string for parsing dates. [default: %(default)s]')
    parser.add_argument('--json', action='store_true', default=False, help='Emit JSON data, not CSV.')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('logfiles', nargs='*', default=['-'])



    @classmethod
    def parse(cls, *args, **overrides):
        # Build an instance from command-line style arguments.  With no
        # positional `args`, argparse falls back to `sys.argv[1:]`.  Keyword
        # `overrides` replace the parsed values of the same name.
        parsed = cls.parser.parse_args(args or None)
        values = dict(**parsed.__dict__)
        values.update(overrides)
        return cls(**values)

    @classmethod
    def main(cls, *args, **overrides):
        # Parse the arguments, run the script and return its result.
        app = cls.parse(*args, **overrides)
        return app()
+
+
+
# Module-level alias for `module:main` style console-script entry points.
# NOTE(review): setup.py declares 'nginx2csv = nginx2csv:main'; this assumes
# the present module is that `nginx2csv` package -- confirm.
main = NginxToCsvScript.main

if __name__ == '__main__':
    sys.exit(NginxToCsvScript.main() or 0)
+
+
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Original `nest` operator from d3.js, by Mike Bostock
+# https://github.com/mbostock/d3/wiki/Arrays#wiki-d3_nest
+#
+# Ported to Python by David Schoonover <dsc@less.ly>
+
+"""
+Nesting allows elements in an array to be grouped into a hierarchical tree structure;
+think of it like the GROUP BY operator in SQL, except you can have multiple levels of
+grouping, and the resulting output is a tree rather than a flat table. The levels in
+the tree are specified by key functions. The leaf nodes of the tree can be sorted by
+value, while the internal nodes can be sorted by key. An optional rollup function will
+collapse the elements in each leaf node using a summary function. The nest operator
+(the object returned by d3.nest) is reusable, and does not retain any references to the
+data that is nested.
+
+For example, consider the following tabular data structure of Barley yields, from
+various sites in Minnesota during 1931-2:
+
+ yields = [
+ {"yield": 27.00, "variety": "Manchuria", "year": 1931, "site": "University Farm"},
+ {"yield": 48.87, "variety": "Manchuria", "year": 1931, "site": "Waseca"},
+ {"yield": 27.43, "variety": "Manchuria", "year": 1931, "site": "Morris"},
+ {"yield": 43.07, "variety": "Glabron", "year": 1931, "site": "University Farm"},
+ {"yield": 55.20, "variety": "Glabron", "year": 1931, "site": "Waseca"},
+ {"yield": 16.18, "variety": "Glabron", "year": 1932, "site": "University Farm"},
+ ]
+
+To facilitate visualization, it may be useful to nest the elements first by year, and then by variety, as follows:
+
+ Nest()
+ .key( lambda d: d['year'] )
+ .key( lambda d: d['variety'] )
+ .entries(yields)
+
+Or more concisely:
+
+ Nest()
+ .key('year')
+ .key('variety')
+ .entries(yields)
+
+...as both the `key` and `prop` functions will interpret non-callables as the key to look up.
+
+This returns a nested array. Each element of the outer array is a key-values pair, listing the values for each distinct key:
+
+ [ {"key": 1931, "values": [
+ {"key": "Manchuria", "values": [
+ {"yield": 27.00, "variety": "Manchuria", "year": 1931, "site": "University Farm"},
+ {"yield": 48.87, "variety": "Manchuria", "year": 1931, "site": "Waseca"},
+ {"yield": 27.43, "variety": "Manchuria", "year": 1931, "site": "Morris"}, ]},
+ {"key": "Glabron", "values": [
+ {"yield": 43.07, "variety": "Glabron", "year": 1931, "site": "University Farm"},
+ {"yield": 55.20, "variety": "Glabron", "year": 1931, "site": "Waseca"}, ]},
+ ]},
+ {"key": 1932, "values": [
+ {"key": "Glabron", "values": [
+ {"yield": 16.18, "variety": "Glabron", "year": 1932, "site": "University Farm"}, ]},
+ ]},
+ ]
+
+The nested form allows easy iteration and generation of hierarchical structures in SVG or HTML.
+
+"""
+
+from collections import defaultdict, namedtuple
+from operator import itemgetter, attrgetter
+
# One node of the `entries()` output: the group key and either the nested
# entries (inner levels) or the leaf values / rollup result (last level).
Entry = namedtuple('Entry', 'key values')



class Nest(object):
    "Array nesting operator."
    _keys       = None   # key functions, one per nesting level
    _sortKeys   = None   # per-level sort spec ({} = unsorted), parallel to _keys
    _sortValues = None   # sort spec for leaf values, or None
    _rollup     = None   # function collapsing each leaf list, or None


    def __init__(self):
        """ Creates a new nest operator. The set of keys is initially empty. If the map
            or entries operator is invoked before any key functions are registered, the
            nest operator simply returns the input array.
        """
        self._keys = []
        self._sortKeys = []


    def _addKey(self, fn, getter):
        """ Registers `fn` as the next key function, first converting a non-callable
            into an accessor via `getter` (`itemgetter` or `attrgetter`).
        """
        if not callable(fn):
            fn = getter(fn)
        self._keys.append(fn)
        self._sortKeys.append({})
        return self


    @staticmethod
    def _sorted(items, spec, pluck=None):
        """ Returns `items` sorted according to `spec`, a dict with the optional
            entries `cmp`, `key` and `reverse`.

            If `pluck` is given it is applied to each item first, so that `key` and
            `cmp` see the plucked value rather than the item itself. `key` is applied
            before `cmp`, mirroring Python 2's `sorted(cmp=..., key=...)`. The
            comparison function is adapted with `functools.cmp_to_key`, so this
            works on interpreters where `sorted` no longer accepts `cmp`. The spec
            is never modified.
        """
        from functools import cmp_to_key

        cmpFn = spec.get('cmp')
        keyFn = spec.get('key')

        def sortKey(item):
            value = item if pluck is None else pluck(item)
            return value if keyFn is None else keyFn(value)

        if cmpFn is None:
            finalKey = sortKey
        else:
            wrap = cmp_to_key(cmpFn)
            finalKey = lambda item: wrap(sortKey(item))

        return sorted(items, key=finalKey, reverse=bool(spec.get('reverse', False)))


    def key(self, fn):
        """ Registers a new key function. The key function will be invoked for each
            element in the input array, and must return a hashable identifier that is
            used to assign the element to its group. As most often the key function is
            just a simple accessor, this function also accepts a non-callable, which
            will be converted into a function that simply performs a dictionary lookup
            via `operator.itemgetter` (contrast this with `prop()`, which uses
            attribute lookup via `operator.attrgetter`).

            Each time a key is registered, it is appended to the end of an internal keys
            array, and the resulting map or entries will have an additional hierarchy
            level. There is not currently a facility to remove or query the registered
            keys. The most-recently registered key is referred to as the current key in
            subsequent methods.

            Returns the nest operator itself, to allow chaining.
        """
        return self._addKey(fn, itemgetter)


    def prop(self, fn):
        """ Registers a new key function, exactly like `key()`, except that a
            non-callable is converted into an attribute lookup via
            `operator.attrgetter` (contrast this with `key()`, which uses a
            dictionary lookup via `operator.itemgetter`).

            Returns the nest operator itself, to allow chaining.
        """
        return self._addKey(fn, attrgetter)


    def sortKeys(self, cmp=None, key=None, reverse=False):
        """ Specifies the order for the most-recently specified key; it is applied
            by `entries()`. `cmp` is an optional two-argument comparison function
            and `key` an optional one-argument function, both operating on the group
            keys; `reverse` flips the order. With no arguments the keys are sorted
            in their natural order.

            Raises IndexError if no key has been registered yet.
        """
        self._sortKeys[-1] = dict(cmp=cmp, key=key, reverse=reverse)
        return self


    def sortValues(self, cmp=None, key=None, reverse=False):
        """ Specifies the order for leaf values; applies to both maps and entries array.
            The arguments have the same meaning as for `sortKeys()`, but operate on
            the leaf elements. Ignored when a rollup function is set.
        """
        self._sortValues = dict(cmp=cmp, key=key, reverse=reverse)
        return self


    def rollup(self, fn):
        """ Specifies a rollup function to be applied on each group of leaf elements.
            The return value of the rollup function will replace the array of leaf values
            in either the associative array returned by the map operator, or the values
            attribute of each entry returned by the entries operator.
        """
        self._rollup = fn
        return self


    def map(self, data, depth=0):
        """ Applies the nest operator to the specified array, returning an associative
            array (dict). Each key in the returned dict corresponds to a distinct value
            returned by the first key function. The associated value depends on the
            number of registered key functions: if there is an additional key, the
            value is another nested dict; otherwise, the value is the list of elements
            from the input array that have the given key value (sorted if
            `sortValues()` was used), or the rollup result if a rollup function is set.

            `depth` is the nesting level being processed; callers normally leave it
            at its default.
        """
        if depth >= len(self._keys):
            # Leaf level: rollup takes precedence over value sorting.
            if self._rollup is not None:
                return self._rollup(data)
            if self._sortValues is not None:
                return self._sorted(data, self._sortValues)
            return data

        keyFn = self._keys[depth]
        groups = defaultdict(list)
        for v in data:
            groups[keyFn(v)].append(v)

        return dict( (k, self.map(members, depth+1)) for k, members in groups.items() )


    def _entries(self, data, depth=0):
        """ Converts the nested dict produced by `map()` into nested lists of `Entry`
            tuples, applying the per-level key ordering from `sortKeys()`.
        """
        if depth >= len(self._keys):
            return data

        values = [ Entry(k, self._entries(v, depth+1)) for k, v in data.items() ]

        keySort = self._sortKeys[depth]
        if keySort:
            # Sort on each entry's key; the stored spec is left untouched so the
            # operator stays reusable across calls.
            values = self._sorted(values, keySort, pluck=attrgetter('key'))

        return values


    def entries(self, data, depth=0):
        """ Applies the nest operator to the specified array, returning an array of
            key-values entries. Each Entry (a named tuple with the fields 'key' and
            'values') in the returned array corresponds to a distinct key value
            returned by the first key function. The entry's values depend on the
            number of registered key functions: if there is an additional key, they
            are another nested array of entries; otherwise, they are the leaf value
            produced by `map()` for that group.

            `depth` is accepted for signature compatibility and is not used.
        """
        return self._entries(self.map(data))

    def __len__(self):
        "Returns the number of registered key functions (nesting levels)."
        return len(self._keys)
+
+
--- /dev/null
+#!python
+# -*- coding: utf-8 -*-
+import sys, os, re
+from os.path import dirname, abspath, join
+from setuptools import setup, find_packages
+
# Directory containing this script; all paths below are resolved against it.
HERE = abspath(dirname(__file__))


def _read_text(*parts):
    "Return the contents of the file at HERE/<parts>, closing it afterwards."
    # Plain text mode: the 'U' flag is rejected by Python 3.11+.
    with open(join(HERE, *parts)) as f:
        return f.read()


def _find_version(source):
    "Extract the string assigned to `__version__` in the given module source."
    match = re.search(r"""^\s*__version__\s*=\s*['"]([^'"]+)['"]""", source, re.MULTILINE)
    if match is None:
        # Fail loudly rather than passing an unparsed line on as the version.
        raise RuntimeError('Unable to find __version__ in nginx2csv/__init__.py')
    return match.group(1)


readme = _read_text('README.md')

# The version is scraped from the package source instead of imported, so the
# package does not have to be importable at install time.
__version__ = _find_version(_read_text('nginx2csv', '__init__.py'))


setup(
    name             = 'nginx2csv',
    version          = __version__,
    description      = 'Create stats from nginx log files.',
    long_description = readme,
    url              = 'http://git.less.ly/?p=nginx2csv.git',

    author           = 'David Schoonover',
    author_email     = 'dsc@less.ly',

    packages         = find_packages(),
    entry_points     = { 'console_scripts':['nginx2csv = nginx2csv:main'] },

    # install_requires = [
    #     "bunch  >= 1.0",
    #     "PyYAML >= 3.10",
    # ],

    keywords         = ['nginx', 'stats', 'csv'],
    classifiers      = [
        "Development Status :: 4 - Beta",
        "Environment :: Console",
        "Intended Audience :: Developers",
        # The comma after the next entry matters: without it the two adjacent
        # string literals are concatenated into one bogus classifier.
        "Topic :: Utilities",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Programming Language :: Python",
        "Programming Language :: Python :: 2.6",
        "Programming Language :: Python :: 2.7",
        "Operating System :: OS Independent",
        "License :: OSI Approved :: MIT License",
    ],
    zip_safe = False,
    license  = "MIT",
)
+