#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Parse nginx access logs into dictionaries of named fields."""

__version__ = '0.1.0'
VERSION = tuple(map(int, __version__.split('.')))

import argparse
import fileinput
import re
import sys
from datetime import datetime

### Example from logs:
# 91.121.211.71 - - [02/May/2012:07:20:28 +0000] "GET / HTTP/1.1" 200 1600 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 1.1.4322)"

### (nginx) There is a predefined log format called "combined":
# log_format combined '$remote_addr - $remote_user [$time_local] '
#                     '"$request" $status $body_bytes_sent '
#                     '"$http_referer" "$http_user_agent"';

# Regex fragment for every supported `$variable` of the log format.
PATTERNS = {
    'ip'         : r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
    'user'       : r'([^\s]*)',
    'time'       : r'([^\]]+?)',
    'request'    : r'([A-Z]+)[ \t]+([^\s"]+)[ \t]+([^"]*)',
    'status'     : r'(\d{2,})',
    'bytes'      : r'(\d+)',
    'referrer'   : r'([^"]+)',
    'user_agent' : r'([^"]+)',
}

# Fields that `NginxLogParser.parse` converts to `datetime` objects.
DATE_KEYS = ['time']

# nginx's own variable names (and common misspellings) mapped onto PATTERNS keys.
ALIASES = {
    'remote_addr'     : 'ip',
    'remote_user'     : 'user',
    'time_local'      : 'time',
    'body_bytes_sent' : 'bytes',
    'http_referer'    : 'referrer',
    'referer'         : 'referrer',
    'http_user_agent' : 'user_agent',
}
for alias, target in ALIASES.items():
    PATTERNS[alias] = PATTERNS[target]
    if target in DATE_KEYS:
        DATE_KEYS.append(alias)

# Trailing timezone offset (e.g. " +0000"); stripped before strptime because
# `%z` is not reliably supported when parsing.
TZ_PAT = re.compile(r'[ \t]+[\+\-]\d{4}[ \t]*$')

# And finally build everything to named regex captures. Each capture is named
# after the variable as written in the format string, so `$referer` yields a
# 'referer' field and `$referrer` a 'referrer' field.
PATTERNS = dict( (k, '(?P<%s>%s)' % (k, pat)) for k, pat in PATTERNS.items() )


# Tokenizer for resolving nginx format: (literal text, variable name, remainder)
TOKENIZER = re.compile(r'([^\$]*)\$([a-zA-Z_]+)(.*)')


def _literal_to_regex(text):
    "Escape literal format text, letting any run of spaces/tabs match `[ \\t]+`."
    # re.split with a capturing group puts the whitespace runs at odd indices.
    chunks = re.split(r'([ \t]+)', text)
    return ''.join(
        '[ \\t]+' if idx % 2 else re.escape(chunk)
        for idx, chunk in enumerate(chunks)
    )


class NginxLogParser(object):
    """ Parse an nginx log file line by line.

        `log_format` is an nginx-style format string made of literal text and
        `$variable` references (see PATTERNS and ALIASES for the supported
        names); `date_format` is the strptime format applied to date fields
        once their timezone offset has been stripped.
    """
    log_format  = '$ip - $user [$time] "$request" $status $bytes "$referer" "$user_agent"'
    # nginx writes $time_local on a 24-hour clock, hence %H (not %I).
    date_format = '%d/%b/%Y:%H:%M:%S'


    def __init__(self, log_format=None, date_format=None):
        self.log_format  = log_format  or self.log_format
        self.date_format = date_format or self.date_format
        self.compiled_format = re.compile( self.formatToRegex(self.log_format) )

    def formatToRegex(self, format):
        """ Convert an nginx-style formatting string into a regex source string.

            Literal text is escaped (whitespace runs match any run of spaces or
            tabs) and each known `$variable` becomes a named capture group.
            A variable that is not in PATTERNS is emitted as its bare name.
        """
        pieces = []
        while format:
            m = TOKENIZER.match(format)
            if m:
                raw, token, format = m.groups()
            else:
                # No more variables: the remainder is literal text.
                raw, token, format = format, '', ''
            pieces.append(_literal_to_regex(raw))
            pieces.append(PATTERNS.get(token, token))
        return ''.join(pieces)

    def parse(self, line):
        """ Parse a logline and return the dict of fields.

            Returns an empty dict when the line does not match the format.
            Date fields are returned as naive `datetime` objects (the timezone
            offset is discarded); a date that does not fit `date_format`
            raises ValueError from `datetime.strptime`.
        """
        m = self.compiled_format.match(line)
        if not m:
            return {}
        data = m.groupdict()
        for k in DATE_KEYS:
            raw_date = data.get(k)
            if raw_date is None:
                continue
            data[k] = datetime.strptime(TZ_PAT.sub('', raw_date), self.date_format)
        return data
class NginxToCsvScript(object):
    """ Parse nginx logs into a CSV file.

        Instances are built from keyword options (normally the parsed
        command-line arguments, see `parse`), which are exposed as attributes.
        Calling the instance parses every non-blank line of `logfiles` and
        stores the resulting field dicts on `self.data`.

        NOTE(review): `--json` and `--outfile` are accepted but nothing in this
        class writes any output yet.
    """

    def __init__(self, **args):
        self.__dict__.update(args)
        self.__args__ = args

    def __call__(self):
        # Honour --date_format as well as --log_format; fall back to the
        # parser's default when the instance was built without that option.
        self.logParser = NginxLogParser(self.log_format, getattr(self, 'date_format', None))
        stripped = ( line.rstrip() for line in fileinput.input(self.logfiles) )
        self.data = [ self.logParser.parse(line) for line in stripped if line ]

    parser = argparse.ArgumentParser(description=__doc__)
    # __version__ is already a dotted string; joining its characters with '.'
    # would print "0...1...0".
    parser.add_argument('--version', action='version', version=__version__)
    parser.add_argument('-f', '--log_format', default=NginxLogParser.log_format,
        help='Log formatting string to parse. [default: %(default)s]')
    parser.add_argument('-d', '--date_format', default=NginxLogParser.date_format,
        help='strptime date-formatting string for parsing dates. [default: %(default)s]')
    parser.add_argument('--json', action='store_true', default=False, help='Emit JSON data, not CSV.')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('logfiles', nargs='*', default=['-'])


    @classmethod
    def parse(cls, *args, **overrides):
        """ Build an instance from command-line style `args` (sys.argv when
            none are given); `overrides` take precedence over parsed values.
        """
        parsed = cls.parser.parse_args(args or None)
        values = dict(vars(parsed))
        values.update(overrides)
        return cls(**values)

    @classmethod
    def main(cls, *args, **overrides):
        "Parse the arguments, run the script and return its result."
        app = cls.parse(*args, **overrides)
        return app()


# Module-level entry point, so that a `nginx2csv:main` console script resolves.
main = NginxToCsvScript.main


if __name__ == '__main__':
    sys.exit(NginxToCsvScript.main() or 0)
The levels in +the tree are specified by key functions. The leaf nodes of the tree can be sorted by +value, while the internal nodes can be sorted by key. An optional rollup function will +collapse the elements in each leaf node using a summary function. The nest operator +(the object returned by d3.nest) is reusable, and does not retain any references to the +data that is nested. + +For example, consider the following tabular data structure of Barley yields, from +various sites in Minnesota during 1931-2: + + yields = [ + {"yield": 27.00, "variety": "Manchuria", "year": 1931, "site": "University Farm"}, + {"yield": 48.87, "variety": "Manchuria", "year": 1931, "site": "Waseca"}, + {"yield": 27.43, "variety": "Manchuria", "year": 1931, "site": "Morris"}, + {"yield": 43.07, "variety": "Glabron", "year": 1931, "site": "University Farm"}, + {"yield": 55.20, "variety": "Glabron", "year": 1931, "site": "Waseca"}, + {"yield": 16.18, "variety": "Glabron", "year": 1932, "site": "University Farm"}, + ] + +To facilitate visualization, it may be useful to nest the elements first by year, and then by variety, as follows: + + Nest() + .key( lambda d: d['year'] ) + .key( lambda d: d['variety'] ) + .entries(yields) + +Or more concisely: + + Nest() + .key('year') + .key('variety') + .entries(yields) + +...as both the `key` and `prop` functions will interpret non-callables as the key to look up. + +This returns a nested array.
from collections import defaultdict, namedtuple
from operator import itemgetter, attrgetter

try:
    from functools import cmp_to_key
except ImportError:  # Python 2.6 has no functools.cmp_to_key
    def cmp_to_key(mycmp):
        "Minimal stand-in: wrap values so `<` defers to the cmp-style function."
        class K(object):
            __slots__ = ['obj']
            def __init__(self, obj):
                self.obj = obj
            def __lt__(self, other):
                return mycmp(self.obj, other.obj) < 0
        return K

Entry = namedtuple('Entry', 'key values')


def _sort_options(spec, pluck=None):
    """ Translate a stored `dict(cmp=, key=, reverse=)` sort spec into keyword
        arguments for `sorted()` that work on both Python 2 and Python 3.

        `pluck`, when given, is applied to each item first (e.g. to take the
        key out of an Entry); the spec's `key` function is applied next and
        the spec's `cmp` function, if any, compares the results.
        The spec itself is never modified.
    """
    cmp_fn = spec.get('cmp')
    key_fn = spec.get('key')

    def project(item):
        if pluck is not None:
            item = pluck(item)
        if key_fn is not None:
            item = key_fn(item)
        return item

    if cmp_fn is None:
        sort_key = project
    else:
        as_key = cmp_to_key(cmp_fn)
        def sort_key(item):
            return as_key(project(item))

    return {'key': sort_key, 'reverse': bool(spec.get('reverse'))}


class Nest(object):
    """ Array nesting operator.

        Groups a flat list of records into a tree, one level per registered
        key function:

            Nest().key('year').key('variety').entries(yields)

        returns a list of `Entry(key, values)` tuples, one per distinct year,
        whose `values` are in turn Entries per variety, whose `values` are the
        matching input records. `map()` returns the same tree as nested dicts.
    """
    _keys = None
    _sortKeys = None
    _sortValues = None
    _rollup = None


    def __init__(self):
        """ Creates a new nest operator. The set of keys is initially empty. If the map
            or entries operator is invoked before any key functions are registered, the
            nest operator simply returns the input array.
        """
        self._keys = []
        self._sortKeys = []


    def _register(self, fn, accessor):
        "Append a key function, wrapping a non-callable `fn` with `accessor`."
        if not callable(fn):
            fn = accessor(fn)
        self._keys.append(fn)
        # An empty spec means "leave this level in grouping order".
        self._sortKeys.append({})
        return self


    def key(self, fn):
        """ Registers a new key function. The key function will be invoked for each
            element in the input array, and must return a hashable identifier that is
            used to assign the element to its group. As most often the key function is
            just a simple accessor, this function also accepts a non-callable, which will
            be converted into a function that simply performs a dictionary lookup via
            `operator.itemgetter` (contrast this with `prop()`, which uses attribute
            lookup via `operator.attrgetter`).

            Each time a key is registered, it is appended to the end of an internal keys
            array, and the resulting map or entries will have an additional hierarchy
            level. There is not currently a facility to remove or query the registered
            keys. The most-recently registered key is referred to as the current key in
            subsequent methods. Returns `self` for chaining.
        """
        return self._register(fn, itemgetter)


    def prop(self, fn):
        """ Registers a new key function, exactly like `key()`, except that a
            non-callable is converted into an attribute lookup via
            `operator.attrgetter` (contrast this with `key()`, which uses a
            dictionary lookup via `operator.itemgetter`). Returns `self` for chaining.
        """
        return self._register(fn, attrgetter)


    def sortKeys(self, cmp=None, key=None, reverse=False):
        """ Specifies the order for the most-recently specified key.

            `key` is applied to each group key and `cmp` (a two-argument,
            negative/zero/positive comparison) compares the results; with neither,
            group keys are sorted in their natural order. Only `entries()` is
            affected. Raises IndexError if no key has been registered yet.
        """
        if not self._sortKeys:
            raise IndexError('sortKeys() called before any key was registered')
        self._sortKeys[-1] = dict(cmp=cmp, key=key, reverse=reverse)
        return self


    def sortValues(self, cmp=None, key=None, reverse=False):
        """ Specifies the order for leaf values; applies to both maps and entries array.
            Ignored when a rollup function is set.
        """
        self._sortValues = dict(cmp=cmp, key=key, reverse=reverse)
        return self


    def rollup(self, fn):
        """ Specifies a rollup function to be applied on each group of leaf elements.
            The return value of the rollup function will replace the array of leaf values
            in either the associative array returned by the map operator, or the values
            attribute of each entry returned by the entries operator.
        """
        self._rollup = fn
        return self


    def map(self, data, depth=0):
        """ Applies the nest operator to the specified array, returning an associative
            array (dict). Each item corresponds to a distinct key value returned by the
            first key function. The item value depends on the number of registered key
            functions: if there is an additional key, the value is another nested dict;
            otherwise, the value is the list of elements from the input array that have
            the given key value (rolled up or sorted if so configured).
        """
        if depth >= len(self._keys):
            if self._rollup:
                return self._rollup(data)
            if self._sortValues:
                return sorted(data, **_sort_options(self._sortValues))
            return data

        groups = defaultdict(list)
        key_fn = self._keys[depth]
        for item in data:
            groups[key_fn(item)].append(item)

        return dict( (k, self.map(group, depth+1)) for k, group in groups.items() )


    def _entries(self, data, depth=0):
        "Convert the nested dicts produced by `map()` into sorted lists of Entry tuples."
        if depth >= len(self._keys):
            return data

        values = [ Entry(k, self._entries(v, depth+1)) for k, v in data.items() ]

        keySort = self._sortKeys[depth]
        if keySort:
            # Sort on each Entry's key; the stored spec is left untouched so the
            # operator stays reusable across calls.
            values = sorted(values, **_sort_options(keySort, pluck=attrgetter('key')))

        return values


    def entries(self, data, depth=0):
        """ Applies the nest operator to the specified array, returning an array of
            key-values entries. Each Entry (a named tuple with the fields 'key' and
            'values') corresponds to a distinct key value returned by the first
            key-function. The entry's values depend on the number of registered key
            functions: if there is an additional key, they are another nested array of
            entries; otherwise, they are the elements from the input array that have
            the given key value. `depth` is accepted for symmetry with `map()` and is
            not used.
        """
        return self._entries(self.map(data))

    def __len__(self):
        "Number of registered key functions."
        return len(self._keys)
#!python
# -*- coding: utf-8 -*-
# Packaging script for nginx2csv.
import io
import os
import re
import sys
from os.path import dirname, abspath, join
from setuptools import setup, find_packages

HERE = abspath(dirname(__file__))

# io.open works on Python 2 and 3 alike (the 'rU' mode is gone in 3.11) and the
# context managers make sure both files are closed again.
with io.open(join(HERE, 'README.md'), encoding='utf-8') as readme_file:
    readme = readme_file.read()

# Read the version out of the package source without importing the package.
with io.open(join(HERE, 'nginx2csv', '__init__.py'), encoding='utf-8') as package_file:
    version_match = re.search(
        r"^__version__\s*=\s*'([^']+)'",
        package_file.read(),
        re.MULTILINE
    )
if version_match is None:
    raise RuntimeError("Unable to find __version__ in nginx2csv/__init__.py")
__version__ = version_match.group(1)


setup(
    name             = 'nginx2csv',
    version          = __version__,
    description      = 'Create stats from nginx log files.',
    long_description = readme,
    url              = 'http://git.less.ly/?p=nginx2csv.git',

    author           = 'David Schoonover',
    author_email     = 'dsc@less.ly',

    packages         = find_packages(),
    # The package exposes its entry point as a classmethod of the script class.
    entry_points     = { 'console_scripts':['nginx2csv = nginx2csv:NginxToCsvScript.main'] },

    # install_requires = [
    #     "bunch  >= 1.0",
    #     "PyYAML >= 3.10",
    # ],

    keywords         = ['nginx', 'stats', 'csv'],
    classifiers      = [
        "Development Status :: 4 - Beta",
        "Environment :: Console",
        "Intended Audience :: Developers",
        "Topic :: Utilities",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Programming Language :: Python",
        "Programming Language :: Python :: 2.6",
        "Programming Language :: Python :: 2.7",
        "Operating System :: OS Independent",
        "License :: OSI Approved :: MIT License",
    ],
    zip_safe = False,
    license  = "MIT",
)