import sys, re, fileinput, argparse
from datetime import datetime
+from urlparse import urlparse
+import yaml
+from bunch import Bunch
+from nest import Nest
+
+
### Example from logs:
# 91.121.211.71 - - [02/May/2012:07:20:28 +0000] "GET / HTTP/1.1" 200 1600 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 1.1.4322)"
# '"$http_referer" "$http_user_agent"';
PATTERNS = {
- 'ip' : r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
- 'user' : r'([^\s]*)',
- 'time' : r'([^\]]+?)',
+ 'ip' : r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',
+ 'user' : r'[^\s]*',
+ 'time' : r'[^\]]+?',
'request' : r'([A-Z]+)[ \t]+([^\s"]+)[ \t]+([^"]*)',
- 'status' : r'(\d{2,})',
- 'bytes' : r'(\d+)',
- 'referrer' : r'([^"]+)',
- 'user_agent' : r'([^"]+)',
+ 'status' : r'\d{2,}',
+ 'bytes' : r'\d+',
+ 'referrer' : r'[^"]+',
+ 'user_agent' : r'[^"]+',
}
DATE_KEYS = ['time']
ALIASES = {
# ugh
TZ_PAT = re.compile(r'[ \t]+[\+\-]\d{4}[ \t]*$')
+CONTENT_TYPE_PAT = re.compile(r'\.(\w+)$')
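+# e.g. TZ_PAT strips the timezone offset from a captured timestamp:
+#   TZ_PAT.sub('', '02/May/2012:07:20:28 +0000') == '02/May/2012:07:20:28'
+# ...and CONTENT_TYPE_PAT pulls the extension off a path, e.g.
+#   CONTENT_TYPE_PAT.search('/static/app.js').group(1) == 'js'  (path illustrative)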
# Finally, wrap each pattern in a named regex capture group
-PATTERNS = dict( (k, '(?P<%s>%s)' % (k, pat)) for k, pat in PATTERNS.items() )
+PATTERNS = Bunch( (k, '(?P<%s>%s)' % (k, pat)) for k, pat in PATTERNS.items() )
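+# A sketch of the resulting values -- each entry is now a named capture:
+#   PATTERNS.ip     == r'(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
+#   PATTERNS.status == r'(?P<status>\d{2,})'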
# Tokenizer for resolving nginx format
class NginxLogParser(object):
"Parse an nginx log file line by line."
log_format = '$ip - $user [$time] "$request" $status $bytes "$referer" "$user_agent"'
- date_format = '%d/%b/%Y:%I:%M:%S'
+ date_format = '%d/%b/%Y:%H:%M:%S'
def __init__(self, log_format=None, date_format=None):
def parse(self, line):
"Parse a logline and return the dict of fields."
m = self.compiled_format.match(line)
- data = m.groupdict() if m else {}
+ data = Bunch(m.groupdict() if m else {})
for k in DATE_KEYS:
if k not in data: continue
v = TZ_PAT.sub('', data[k])
data[k] = datetime.strptime(v, self.date_format)
- return data
+ if 'time' in data:
+ data.date = data.time.date()
+ if 'request' in data:
+            data.url = re.match(PATTERNS.request, data.request).group(3)
+ url = urlparse(data.url)
+ for k in ('path', 'query', 'params', 'fragment'):
+ data[k] = getattr(url, k)
+ m = CONTENT_TYPE_PAT.search(data.path)
+            data.content_type = m.group(1) if m else 'html'
+ return dict(data)
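+    # Rough shape of parse() on the sample log line at the top of the file
+    # ('date', 'url', 'path', and 'content_type' are the derived fields added
+    # above; output abridged):
+    #   {'ip': '91.121.211.71', 'user': '-',
+    #    'time': datetime(2012, 5, 2, 7, 20, 28), 'date': date(2012, 5, 2),
+    #    'request': 'GET / HTTP/1.1', 'url': '/', 'path': '/',
+    #    'content_type': 'html', 'status': '200', 'bytes': '1600', ...}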
self.__dict__.update(**args)
self.__args__ = args
+ def filter(self, req):
+ path = req.get('path', '')
+ lc_path = path.lower()
+ return ( req and path and
+ req['content_type'] == 'html' and
+ path != '/datasources/all' and
+ not path.startswith('/wiki') and
+ all( tok not in lc_path for tok in self.exclude )
+ )
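+    # Roughly: keep ordinary page views, and drop assets (non-'html'
+    # content_type), the wiki, '/datasources/all', and any path containing an
+    # excluded token:
+    #   filter({'path': '/about', 'content_type': 'html'})     -> truthy
+    #   filter({'path': '/wiki/Home', 'content_type': 'html'}) -> False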
+
def __call__(self):
-        self.logParser = NginxLogParser(self.log_format)
+        self.logParser = NginxLogParser(self.log_format, self.date_format)
- self.data = [ self.logParser.parse(line.rstrip()) for line in fileinput.input(self.logfiles) if line.rstrip() ]
+        self.data = filter(self.filter, [
+            self.logParser.parse(line.rstrip())
+            for line in fileinput.input(self.logfiles) if line.rstrip() ])
+ self.byDay = (Nest()
+ # .key('date')
+ .key( lambda d: d['date'].strftime('%Y/%m/%d') )
+ .sortKeys()
+ .key('path')
+ .rollup(len)
+ .map(self.data))
+ for day, byUrl in self.byDay.iteritems():
+ byUrl['TOTAL'] = sum(byUrl.values())
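+        # byDay is roughly (counts invented for illustration):
+        #   OrderedDict([('2012/05/02', {'/': 12, '/about': 3, 'TOTAL': 15}), ...])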
+ # print yaml.safe_dump(self.byDay, indent=4, default_flow_style=False)
+
+        # Write the CSV header, then one row of total views per day
+ self.outfile.write('Date,Views\n')
+ for day, byUrl in self.byDay.iteritems():
+ self.outfile.write('%s,%s\n' % (day, byUrl['TOTAL']))
+ self.outfile.write('\n')
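+        # Sample of the emitted CSV (dates and counts illustrative):
+        #   Date,Views
+        #   2012/05/02,15
+        #   2012/05/03,22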
+
+
+
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--version', action='version', version='.'.join(map(str,__version__)))
help='Log formatting string to parse. [default: %(default)s]')
parser.add_argument('-d', '--date_format', default=NginxLogParser.date_format,
help='strptime date-formatting string for parsing dates. [default: %(default)s]')
- parser.add_argument('--json', action='store_true', default=False, help='Emit JSON data, not CSV.')
+ parser.add_argument('-x', '--exclude', action='append', default=['php', 'fck', 'w00t', 'webhook'],
+ help='Log lines whose path contains any of these (case-insensitive) substrings are dropped.')
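+    # NB: with action='append', argparse appends any user-supplied -x values
+    # to this default list rather than replacing it -- stock argparse behavior.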
+ # parser.add_argument('--json', action='store_true', default=False, help='Emit JSON data, not CSV.')
parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), default=sys.stdout)
parser.add_argument('logfiles', nargs='*', default=['-'])
-
-
@classmethod
def parse(cls, *args, **overrides):
parsed = cls.parser.parse_args(args or None)
"""
-from collections import defaultdict, namedtuple
+from collections import defaultdict, namedtuple, OrderedDict
from operator import itemgetter, attrgetter
Entry = namedtuple('Entry', 'key values')
return data
values = defaultdict(list)
- for v in data:
+ it = data.iteritems() if isinstance(data, dict) else enumerate(data)
+ for i, v in it:
k = self._keys[depth](v)
values[k].append(v)
- return dict( (k, self.map(values.get(k), depth+1)) for k in values )
+ keys = values.keys()
+ if self._sortKeys[depth]:
+ keys = sorted(keys, **self._sortKeys[depth])
+
+ return OrderedDict( (k, self.map(values.get(k), depth+1)) for k in keys )
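+    # Rough sketch of map(), assuming d3.nest-style semantics (rollup, when
+    # set, is applied to each leaf group; key order follows sortKeys):
+    #   Nest().key(lambda d: d['a']).sortKeys().rollup(len)
+    #         .map([{'a': 'x'}, {'a': 'y'}, {'a': 'x'}])
+    #   == OrderedDict([('x', 2), ('y', 1)])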
def _entries(self, data, depth=0):
# Remove `cmp` if it exists, wrapping it to pluck the key from the entry-tuple
propCmp = keySort.pop('cmp', cmp)
# Then apply the sort using the rest of the specified settings
values = sorted(values, cmp=lambda a, b: propCmp(a['key'], b['key']), **keySort)
return values
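    # Note: byDay's bare .sortKeys() above orders its '%Y/%m/%d' date keys
    # lexically, which coincides with chronological order for that format, so
    # the default cmp suffices there.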