From e69415bdc787d11e4908e040f7c6567db89a2eef Mon Sep 17 00:00:00 2001 From: dsc Date: Fri, 25 May 2012 10:39:49 -0700 Subject: [PATCH] Normalizes whitespace. --- classes.py | 174 ++++++++++++++-------------- stats.py | 372 ++++++++++++++++++++++++++++++------------------------------ 2 files changed, 276 insertions(+), 270 deletions(-) diff --git a/classes.py b/classes.py index 04b3096..7844d5f 100644 --- a/classes.py +++ b/classes.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- """ gerrit-stats: Generate codereview stats based from Gerrit commits Copyright (C) 2012 Diederik van Liere, Wikimedia Foundation @@ -21,98 +23,100 @@ import os from datetime import datetime class Metric(object): - ''' - The Metric class - ''' - def __init__(self, name, raw_query, settings): - self.raw_query = raw_query - self.name = name - self.query = 'ssh -p %s %s gerrit query --format=%s %s' % (settings.port, settings.host, settings.format, self.raw_query) + ''' + The Metric class + ''' + def __init__(self, name, raw_query, settings): + self.raw_query = raw_query + self.name = name + self.query = 'ssh -p %s %s gerrit query --format=%s %s' % (settings.port, settings.host, settings.format, self.raw_query) class Settings(object): - ''' - This object contains properties that apply to all repositories, including the queries that will be - run to generate the statistics, a list of repositories to ignore and a set of engineers that do not use - a WMF email address and hence will be classified as volunteer. - ''' - def __init__(self, settings): - self.queries = {'only+1':'-- CodeReview+1 -CodeReview+2 -CodeReview-1 -CodeReview-2', - 'no_review':'-- -CodeReview+1 -CodeReview-1 -CodeReview+2 -CodeReview-2', - } - self.whitelist=set(['niklas.laxstrom@gmail.com','roan.kattouw@gmail.com','maxsem.wiki@gmail.com','s.mazeland@xs4all.nl','jeroendedauw@gmail.com','mediawiki@danielfriesen.name','jdlrobson@gmail.com','hashar@free.fr']) - self.ignore_repos = ['test'] - self.metrics = {} - self.parents = ['mediawiki/core', - 'mediawiki/extensions', - 'operations', - 'analytics', - ] - - for name, query in self.queries.iteritems(): - self.metrics[name] = Metric(name, query, settings) - - def __str__(self): - return 'Metrics container object' + ''' + This object contains properties that apply to all repositories, including the queries that will be + run to generate the statistics, a list of repositories to ignore and a set of engineers that do not use + a WMF email address and hence will be classified as volunteer. + ''' + def __init__(self, settings): + self.queries = {'only+1':'-- CodeReview+1 -CodeReview+2 -CodeReview-1 -CodeReview-2', + 'no_review':'-- -CodeReview+1 -CodeReview-1 -CodeReview+2 -CodeReview-2', + } + self.whitelist=set(['niklas.laxstrom@gmail.com','roan.kattouw@gmail.com','maxsem.wiki@gmail.com','s.mazeland@xs4all.nl','jeroendedauw@gmail.com','mediawiki@danielfriesen.name','jdlrobson@gmail.com','hashar@free.fr']) + self.ignore_repos = ['test'] + self.metrics = {} + self.parents = ['mediawiki/core', + 'mediawiki/extensions', + 'operations', + 'analytics', + ] + + for name, query in self.queries.iteritems(): + self.metrics[name] = Metric(name, query, settings) + + def __str__(self): + return 'Metrics container object' class Gerrit(object): - ''' - This object contains the setings to interact with the gerrit server, nothing fancy these are just - sensible defaults. - ''' - def __init__(self): - self.data_location = 'data' - self.host = 'gerrit.wikimedia.org' - self.port = 29418 - self.format = 'JSON' - - def __str__(self): - return 'Codereview settings object.' + ''' + This object contains the setings to interact with the gerrit server, nothing fancy these are just + sensible defaults. + ''' + def __init__(self): + self.data_location = 'data' + self.host = 'gerrit.wikimedia.org' + self.port = 29418 + self.format = 'JSON' + + def __str__(self): + return 'Codereview settings object.' class Repo(object): - def __init__(self, name, settings, gerrit): - self.touched = False - self.name = name - self.dataset = {} - self.create_path(self.name, gerrit) - self.filename = ('%s.csv' % (self.determine_filename(self.name))) - self.filemode = self.determine_filemode(self.filename, gerrit) - - self.today = datetime.today() - self.email = {} - self.email['wikimedian'] = set() - self.email['volunteer'] = set() - self.num_metrics = 0 - for metric in settings.metrics: - self.dataset[metric] = {} - self.dataset[metric]['oldest'] = datetime(2030,1,1) - self.dataset[metric]['wikimedian'] = 0 - self.dataset[metric]['volunteer'] = 0 - self.dataset[metric]['total'] = 0 - self.num_metrics +=1 - - def __str__(self): - return self.name - - def create_path(self, filename, gerrit): - print filename - dir= os.path.dirname(filename) - if dir != '': - dir = os.path.join(gerrit.data_location, dir) - try: - os.makedirs(dir) - print 'Creating %s...' % dir - except OSError: - pass - - def determine_filename(self, filename): - return os.path.basename(filename) - - def determine_filemode(self, filename, settings): - if os.path.isfile('%s/%s' % (settings.data_location, filename)) == False: - return 'w' - else: - return 'a' + + def __init__(self, name, settings, gerrit): + self.touched = False + self.name = name + self.dataset = {} + self.create_path(self.name, gerrit) + self.filename = ('%s.csv' % (self.determine_filename(self.name))) + self.filemode = self.determine_filemode(self.filename, gerrit) + + self.today = datetime.today() + self.email = {} + self.email['wikimedian'] = set() + self.email['volunteer'] = set() + self.num_metrics = 0 + + for metric in settings.metrics: + self.dataset[metric] = {} + self.dataset[metric]['oldest'] = datetime(2030,1,1) + self.dataset[metric]['wikimedian'] = 0 + self.dataset[metric]['volunteer'] = 0 + self.dataset[metric]['total'] = 0 + self.num_metrics +=1 + + def __str__(self): + return self.name + + def create_path(self, filename, gerrit): + print filename + dir= os.path.dirname(filename) + if dir != '': + dir = os.path.join(gerrit.data_location, dir) + try: + os.makedirs(dir) + print 'Creating %s...' % dir + except OSError: + pass + + def determine_filename(self, filename): + return os.path.basename(filename) + + def determine_filemode(self, filename, settings): + if os.path.isfile('%s/%s' % (settings.data_location, filename)) == False: + return 'w' + else: + return 'a' diff --git a/stats.py b/stats.py index 40a9985..d1a295c 100644 --- a/stats.py +++ b/stats.py @@ -1,185 +1,187 @@ -""" -gerrit-stats: Generate codereview stats based from Gerrit commits -Copyright (C) 2012 Diederik van Liere, Wikimedia Foundation - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -""" -import subprocess -import json -import sys -import os -from datetime import datetime - -from classes import Gerrit, Settings, Metric, Repo - -def create_repo_set(gerrit, settings): - repos = {} - output = run_gerrit_query('ssh -p 29418 gerrit.wikimedia.org gerrit ls-projects') - output = output.split('\n') - for repo in output: - repo = repo.strip() - if len(repo) > 1: - tests = [repo.find(ignore) == -1 for ignore in settings.ignore_repos] - if all(tests): - rp = Repo(repo, settings, gerrit) - repos[rp.name] = rp - return repos - - -def is_wikimedian(email, whitelist): - if email in whitelist: - return True - if email.endswith('wikimedia.org'): - return True - else: - return False - - -def set_delimiter(fields, counter): - num_fields = len(fields) - if num_fields-counter != 1: - return ',' - else: - return '' - -def output_results(fh, *args): - args = [str(arg) for arg in args] - output = ''.join(args) - fh.write(output) - sys.stdout.write(output) - -def write_heading(fh, repo): - output_results(fh, 'data',',','repository',',') - #fh.write('%s,%s,' % ('date', 'repository')) - #sys.stdout.write('%s,%s,' % ('date', 'repository')) - for metric_counter, (name, metric) in enumerate(repo.dataset.iteritems()): - headings = metric.keys() - for counter, heading in enumerate(headings): - if metric_counter +1 == repo.num_metrics: - delim = set_delimiter(headings, counter) - else: - delim = ',' - #fh.write('%s_%s%s' % (name, heading, delim)) - #sys.stdout.write('%s_%s%s' % (name, heading, delim)) - output_results(fh, name,'_', heading, delim) - fh.write('\n') - sys.stdout.write('\n') - - -def construct_timestamp(epoch): - return datetime.fromtimestamp(epoch) - - -def run_gerrit_query(query): - query = query.split(' ') - output = subprocess.Popen(query, shell=False, stdout=subprocess.PIPE).communicate()[0] - return output - - -def create_dataset(repos, gerrit): - for key, repo in repos.iteritems(): - fh = open('%s/%s' % (gerrit.data_location, repo.filename), repo.filemode) - if repo.filemode == 'w': - write_heading(fh, repo) - #sys.stdout.write('%s-%s-%s,%s,' % (repo.today.month,repo.today.day,repo.today.year, repo.name)) - #fh.write('%s-%s-%s,%s,' % (repo.today.month,repo.today.day,repo.today.year, repo.name)) - output_results(fh, repo.today.month,'-',repo.today.day,'-',repo.today.year,',',repo.name,',') - print_dict(repo, fh) - sys.stdout.write('\n*****************\n') - sys.stdout.write('\n') - fh.write('\n') - fh.close() - - -def print_dict(repo, fh, ident = '', braces=1): - """ Recursively prints nested dictionaries.""" - dataset = repo.dataset - for metric_counter, metric in enumerate(dataset): - fields = dataset[metric].keys() - for counter, field in enumerate(fields): - if metric_counter +1 == repo.num_metrics: - delim = set_delimiter(fields, counter) - else: - delim = ',' - #print delim - sys.stdout.write('%s%s' % (dataset[metric][field], delim)) - fh.write('%s%s' % (dataset[metric][field], delim)) - - -def cleanup_volunteers(repos, whitelist): - for name, repo in repos.iteritems(): - for ws in whitelist: - if ws in repo.email['volunteer']: - repo.email['wikimedian'].add(ws) - repo.email['email']['volunteer'].remove(ws) - return repos - - -def construct_dataset(settings, repos, metric, output, gerrit): - output=output.split('\n') - for obs in output: - try: - obs= json.loads(obs) - except ValueError, e: - print e - - if isinstance(obs, dict) and 'rowCount' not in obs: - try: - project = obs['project'] - except KeyError, e: - print e, obs - email = obs['owner']['email'] - repo = repos.get(project, {}) - if repo == {}: - continue - dt = construct_timestamp(obs['createdOn']) - - # print "REPO: %s" % repo - # print "PROJECT: %s" % project - # print "METRIC: %s" % metric - # print "DATASET: %s" % repo.dataset - - if repo.dataset[metric]['oldest'] > dt: - repo.dataset[metric]['oldest'] = dt - repo.dataset[metric]['total'] +=1 - if is_wikimedian(email, settings.whitelist) == True: - repo.dataset[metric]['wikimedian'] +=1 - repo.email['wikimedian'].add(email) - else: - repo.dataset[metric]['volunteer'] +=1 - repo.email['volunteer'].add(email) - repo.touched = True - - -def main(): - gerrit = Gerrit() - settings = Settings(gerrit) - print 'Fetching list of all gerrit repositories...' - repos = create_repo_set(gerrit, settings) - - for metric in settings.metrics.itervalues(): - #query = 'ssh -p %s %s gerrit query --format=%s %s' % (gerrit.port, gerrit.host, gerrit.format, question) - output = run_gerrit_query(metric.query) - print 'Running %s' % metric.query - construct_dataset(settings, repos, metric.name, output, gerrit) - - print 'Fixing miscategorization of volunteer engineers...' - repos = cleanup_volunteers(repos, settings.whitelist) - print 'Creating datasets...' - create_dataset(repos, gerrit) - - -if __name__== '__main__': - main() \ No newline at end of file +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +gerrit-stats: Generate codereview stats based from Gerrit commits +Copyright (C) 2012 Diederik van Liere, Wikimedia Foundation + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +""" +import subprocess +import json +import sys +import os +from datetime import datetime + +from classes import Gerrit, Settings, Metric, Repo + +def create_repo_set(gerrit, settings): + repos = {} + output = run_gerrit_query('ssh -p 29418 gerrit.wikimedia.org gerrit ls-projects') + output = output.split('\n') + for repo in output: + repo = repo.strip() + if len(repo) > 1: + tests = [repo.find(ignore) == -1 for ignore in settings.ignore_repos] + if all(tests): + rp = Repo(repo, settings, gerrit) + repos[rp.name] = rp + return repos + + +def is_wikimedian(email, whitelist): + if email in whitelist: + return True + if email.endswith('wikimedia.org'): + return True + else: + return False + + +def set_delimiter(fields, counter): + num_fields = len(fields) + if num_fields-counter != 1: + return ',' + else: + return '' + +def output_results(fh, *args): + args = [str(arg) for arg in args] + output = ''.join(args) + fh.write(output) + sys.stdout.write(output) + +def write_heading(fh, repo): + output_results(fh, 'data',',','repository',',') + #fh.write('%s,%s,' % ('date', 'repository')) + #sys.stdout.write('%s,%s,' % ('date', 'repository')) + for metric_counter, (name, metric) in enumerate(repo.dataset.iteritems()): + headings = metric.keys() + for counter, heading in enumerate(headings): + if metric_counter +1 == repo.num_metrics: + delim = set_delimiter(headings, counter) + else: + delim = ',' + #fh.write('%s_%s%s' % (name, heading, delim)) + #sys.stdout.write('%s_%s%s' % (name, heading, delim)) + output_results(fh, name,'_', heading, delim) + fh.write('\n') + sys.stdout.write('\n') + + +def construct_timestamp(epoch): + return datetime.fromtimestamp(epoch) + + +def run_gerrit_query(query): + query = query.split(' ') + output = subprocess.Popen(query, shell=False, stdout=subprocess.PIPE).communicate()[0] + return output + + +def create_dataset(repos, gerrit): + for key, repo in repos.iteritems(): + fh = open('%s/%s' % (gerrit.data_location, repo.filename), repo.filemode) + if repo.filemode == 'w': + write_heading(fh, repo) + #sys.stdout.write('%s-%s-%s,%s,' % (repo.today.month,repo.today.day,repo.today.year, repo.name)) + #fh.write('%s-%s-%s,%s,' % (repo.today.month,repo.today.day,repo.today.year, repo.name)) + output_results(fh, repo.today.month,'-',repo.today.day,'-',repo.today.year,',',repo.name,',') + print_dict(repo, fh) + sys.stdout.write('\n*****************\n') + sys.stdout.write('\n') + fh.write('\n') + fh.close() + + +def print_dict(repo, fh, ident = '', braces=1): + """ Recursively prints nested dictionaries.""" + dataset = repo.dataset + for metric_counter, metric in enumerate(dataset): + fields = dataset[metric].keys() + for counter, field in enumerate(fields): + if metric_counter +1 == repo.num_metrics: + delim = set_delimiter(fields, counter) + else: + delim = ',' + #print delim + sys.stdout.write('%s%s' % (dataset[metric][field], delim)) + fh.write('%s%s' % (dataset[metric][field], delim)) + + +def cleanup_volunteers(repos, whitelist): + for name, repo in repos.iteritems(): + for ws in whitelist: + if ws in repo.email['volunteer']: + repo.email['wikimedian'].add(ws) + repo.email['email']['volunteer'].remove(ws) + return repos + + +def construct_dataset(settings, repos, metric, output, gerrit): + output=output.split('\n') + for obs in output: + try: + obs= json.loads(obs) + except ValueError, e: + print e + + if isinstance(obs, dict) and 'rowCount' not in obs: + try: + project = obs['project'] + except KeyError, e: + print e, obs + email = obs['owner']['email'] + repo = repos.get(project, {}) + if repo == {}: + continue + dt = construct_timestamp(obs['createdOn']) + + # print "REPO: %s" % repo + # print "PROJECT: %s" % project + # print "METRIC: %s" % metric + # print "DATASET: %s" % repo.dataset + + if repo.dataset[metric]['oldest'] > dt: + repo.dataset[metric]['oldest'] = dt + repo.dataset[metric]['total'] +=1 + if is_wikimedian(email, settings.whitelist) == True: + repo.dataset[metric]['wikimedian'] +=1 + repo.email['wikimedian'].add(email) + else: + repo.dataset[metric]['volunteer'] +=1 + repo.email['volunteer'].add(email) + repo.touched = True + + +def main(): + gerrit = Gerrit() + settings = Settings(gerrit) + print 'Fetching list of all gerrit repositories...' + repos = create_repo_set(gerrit, settings) + + for metric in settings.metrics.itervalues(): + #query = 'ssh -p %s %s gerrit query --format=%s %s' % (gerrit.port, gerrit.host, gerrit.format, question) + output = run_gerrit_query(metric.query) + print 'Running %s' % metric.query + construct_dataset(settings, repos, metric.name, output, gerrit) + + print 'Fixing miscategorization of volunteer engineers...' + repos = cleanup_volunteers(repos, settings.whitelist) + print 'Creating datasets...' + create_dataset(repos, gerrit) + + +if __name__== '__main__': + main() \ No newline at end of file -- 1.7.0.4