From: dsc Date: Fri, 1 Jun 2012 12:10:19 +0000 (+0200) Subject: It works. X-Git-Url: http://git.less.ly:3516/?a=commitdiff_plain;h=9644e7446f44a35712a811a6446c1bd3f900224d;p=nginx2csv.git It works. --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a9a5aec --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +tmp diff --git a/nginx2csv/__init__.py b/nginx2csv/__init__.py index d09f390..7f1cfe2 100755 --- a/nginx2csv/__init__.py +++ b/nginx2csv/__init__.py @@ -6,6 +6,12 @@ VERSION = tuple(map(int, __version__.split('.'))) import sys, re, fileinput, argparse from datetime import datetime +from urlparse import urlparse +import yaml +from bunch import * +from nest import Nest + + ### Example from logs: # 91.121.211.71 - - [02/May/2012:07:20:28 +0000] "GET / HTTP/1.1" 200 1600 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 1.1.4322)" @@ -16,14 +22,14 @@ from datetime import datetime # '"$http_referer" "$http_user_agent"'; PATTERNS = { - 'ip' : r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', - 'user' : r'([^\s]*)', - 'time' : r'([^\]]+?)', + 'ip' : r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', + 'user' : r'[^\s]*', + 'time' : r'[^\]]+?', 'request' : r'([A-Z]+)[ \t]+([^\s"]+)[ \t]+([^"]*)', - 'status' : r'(\d{2,})', - 'bytes' : r'(\d+)', - 'referrer' : r'([^"]+)', - 'user_agent' : r'([^"]+)', + 'status' : r'\d{2,}', + 'bytes' : r'\d+', + 'referrer' : r'[^"]+', + 'user_agent' : r'[^"]+', } DATE_KEYS = ['time'] ALIASES = { @@ -41,9 +47,10 @@ for alias, target in ALIASES.items(): # ugh TZ_PAT = re.compile(r'[ \t]+[\+\-]\d{4}[ \t]*$') +CONTENT_TYPE_PAT = re.compile(r'\.(\w+)$') # And finally build everything to named regex captures -PATTERNS = dict( (k, '(?P<%s>%s)' % (k, pat)) for k, pat in PATTERNS.items() ) +PATTERNS = Bunch( (k, '(?P<%s>%s)' % (k, pat)) for k, pat in PATTERNS.items() ) # Tokenizer for resolving nginx format @@ -52,7 +59,7 @@ TOKENIZER = re.compile(r'([^\$]*)\$([a-zA-Z_]+)(.*)') class NginxLogParser(object): "Parse an nginx log file line by line." log_format = '$ip - $user [$time] "$request" $status $bytes "$referer" "$user_agent"' - date_format = '%d/%b/%Y:%I:%M:%S' + date_format = '%d/%b/%Y:%H:%M:%S' def __init__(self, log_format=None, date_format=None): @@ -77,12 +84,21 @@ class NginxLogParser(object): def parse(self, line): "Parse a logline and return the dict of fields." m = self.compiled_format.match(line) - data = m.groupdict() if m else {} + data = Bunch(m.groupdict() if m else {}) for k in DATE_KEYS: if k not in data: continue v = TZ_PAT.sub('', data[k]) data[k] = datetime.strptime(v, self.date_format) - return data + if 'time' in data: + data.date = data.time.date() + if 'request' in data: + data.url = re.match( PATTERNS.request, data.request ).expand(r'\3') + url = urlparse(data.url) + for k in ('path', 'query', 'params', 'fragment'): + data[k] = getattr(url, k) + m = CONTENT_TYPE_PAT.search(data.path) + data.content_type = m.expand(r'\1') if m else 'html' + return dict(data) @@ -93,9 +109,38 @@ class NginxToCsvScript(object): self.__dict__.update(**args) self.__args__ = args + def filter(self, req): + path = req.get('path', '') + lc_path = path.lower() + return ( req and path and + req['content_type'] == 'html' and + path != '/datasources/all' and + not path.startswith('/wiki') and + all( tok not in lc_path for tok in self.exclude ) + ) + def __call__(self): self.logParser = NginxLogParser(self.log_format) - self.data = [ self.logParser.parse(line.rstrip()) for line in fileinput.input(self.logfiles) if line.rstrip() ] + self.data = filter(self.filter, [ self.logParser.parse(line.rstrip()) for line in fileinput.input(self.logfiles) if line.rstrip() ]) + self.byDay = (Nest() + # .key('date') + .key( lambda d: d['date'].strftime('%Y/%m/%d') ) + .sortKeys() + .key('path') + .rollup(len) + .map(self.data)) + for day, byUrl in self.byDay.iteritems(): + byUrl['TOTAL'] = sum(byUrl.values()) + # print yaml.safe_dump(self.byDay, indent=4, default_flow_style=False) + + # Write Headers + self.outfile.write('Date,Views\n') + for day, byUrl in self.byDay.iteritems(): + self.outfile.write('%s,%s\n' % (day, byUrl['TOTAL'])) + self.outfile.write('\n') + + + parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('--version', action='version', version='.'.join(map(str,__version__))) @@ -103,12 +148,12 @@ class NginxToCsvScript(object): help='Log formatting string to parse. [default: %(default)s]') parser.add_argument('-d', '--date_format', default=NginxLogParser.date_format, help='strptime date-formatting string for parsing dates. [default: %(default)s]') - parser.add_argument('--json', action='store_true', default=False, help='Emit JSON data, not CSV.') + parser.add_argument('-x', '--exclude', action='append', default=['php', 'fck', 'w00t', 'webhook'], + help='Log lines whose path contains any of these (case-insensitive) substrings are dropped.') + # parser.add_argument('--json', action='store_true', default=False, help='Emit JSON data, not CSV.') parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), default=sys.stdout) parser.add_argument('logfiles', nargs='*', default=['-']) - - @classmethod def parse(cls, *args, **overrides): parsed = cls.parser.parse_args(args or None) diff --git a/nginx2csv/nest.py b/nginx2csv/nest.py index bd79f06..1d9d581 100644 --- a/nginx2csv/nest.py +++ b/nginx2csv/nest.py @@ -65,7 +65,7 @@ The nested form allows easy iteration and generation of hierarchical structures """ -from collections import defaultdict, namedtuple +from collections import defaultdict, namedtuple, OrderedDict from operator import itemgetter, attrgetter Entry = namedtuple('Entry', 'key values') @@ -171,11 +171,16 @@ class Nest(object): return data values = defaultdict(list) - for v in data: + it = data.iteritems() if isinstance(data, dict) else enumerate(data) + for i, v in it: k = self._keys[depth](v) values[k].append(v) - return dict( (k, self.map(values.get(k), depth+1)) for k in values ) + keys = values.keys() + if self._sortKeys[depth]: + keys = sorted(keys, **self._sortKeys[depth]) + + return OrderedDict( (k, self.map(values.get(k), depth+1)) for k in keys ) def _entries(self, data, depth=0): @@ -189,6 +194,12 @@ class Nest(object): # Remove `cmp` if it exists, wrapping it to pluck the key from the entry-tuple propCmp = keySort.pop('cmp', cmp) # Then apply the sort using the rest of the specified settings + # def sorter(a,b): + # ret = propCmp(a['key'], b['key']) + # sign = '<' if ret < 0 else ('>' if ret > 0 else '=') + # print '%s %s %s' % (a['key'], sign, b['key']) + # return ret + # values = sorted(values, cmp=sorter, **keySort) values = sorted(values, cmp=lambda a, b: propCmp(a['key'], b['key']), **keySort) return values diff --git a/setup.py b/setup.py index 3a68d7d..5129b8a 100644 --- a/setup.py +++ b/setup.py @@ -26,12 +26,12 @@ setup( author_email = 'dsc@less.ly', packages = find_packages(), - entry_points = { 'console_scripts':['nginx2csv = nginx2csv:main'] }, + entry_points = { 'console_scripts':['nginx2csv = nginx2csv:NginxToCsvScript.main'] }, - # install_requires = [ - # "bunch >= 1.0", - # "PyYAML >= 3.10", - # ], + install_requires = [ + "bunch >= 1.0", + "PyYAML >= 3.10", + ], keywords = ['nginx', 'stats', 'csv'], classifiers = [