From 3e6153e9a4d869e810332383e54d4f9e374a02f7 Mon Sep 17 00:00:00 2001 From: Diederik Date: Thu, 24 May 2012 22:24:56 -0400 Subject: [PATCH] Initial commit of the gerrit-stats --- classes.py | 118 ++++++++++++++++++++++++++++++++++++++ stats.py | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 303 insertions(+), 0 deletions(-) create mode 100644 classes.py create mode 100644 stats.py diff --git a/classes.py b/classes.py new file mode 100644 index 0000000..04b3096 --- /dev/null +++ b/classes.py @@ -0,0 +1,118 @@ +""" +gerrit-stats: Generate codereview stats based from Gerrit commits +Copyright (C) 2012 Diederik van Liere, Wikimedia Foundation + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
import os
from datetime import datetime


class Metric(object):
    """A named Gerrit query and the ssh command line that runs it.

    name      -- short label of the metric (used as a dataset key).
    raw_query -- the search expression passed to ``gerrit query``.
    settings  -- object exposing ``port``, ``host`` and ``format``
                 attributes (a Gerrit instance in this code base).

    The complete command is stored in ``self.query`` as a single string.
    """
    def __init__(self, name, raw_query, settings):
        self.raw_query = raw_query
        self.name = name
        self.query = 'ssh -p %s %s gerrit query --format=%s %s' % (
            settings.port, settings.host, settings.format, self.raw_query)


class Settings(object):
    """Properties that apply to all repositories.

    Holds the queries that will be run to generate the statistics, a list
    of repositories to ignore and a set of engineers that do not use a WMF
    email address and hence would otherwise be classified as volunteer.

    settings -- the connection settings object handed to every Metric
                (it must expose ``port``, ``host`` and ``format``).
    """
    def __init__(self, settings):
        self.queries = {
            'only+1': '-- CodeReview+1 -CodeReview+2 -CodeReview-1 -CodeReview-2',
            'no_review': '-- -CodeReview+1 -CodeReview-1 -CodeReview+2 -CodeReview-2',
        }
        self.whitelist = set([
            'niklas.laxstrom@gmail.com',
            'roan.kattouw@gmail.com',
            'maxsem.wiki@gmail.com',
            's.mazeland@xs4all.nl',
            'jeroendedauw@gmail.com',
            'mediawiki@danielfriesen.name',
            'jdlrobson@gmail.com',
            'hashar@free.fr',
        ])
        self.ignore_repos = ['test']
        self.parents = [
            'mediawiki/core',
            'mediawiki/extensions',
            'operations',
            'analytics',
        ]

        # One Metric per configured query, keyed by the query's name.
        # items() (rather than the Python-2-only iteritems()) keeps this
        # working on both Python 2 and Python 3.
        self.metrics = {}
        for name, query in self.queries.items():
            self.metrics[name] = Metric(name, query, settings)

    def __str__(self):
        return 'Metrics container object'


class Gerrit(object):
    """Settings needed to interact with the gerrit server.

    Nothing fancy, these are just sensible defaults: the directory the
    datasets are written to, the ssh host and port, and the output format
    requested from ``gerrit query``.
    """
    def __init__(self):
        self.data_location = 'data'
        self.host = 'gerrit.wikimedia.org'
        self.port = 29418
        self.format = 'JSON'

    def __str__(self):
        return 'Codereview settings object.'
class Repo(object):
    """Per-repository counters plus the CSV file they are written to.

    name     -- the gerrit project name, possibly containing '/'.
    settings -- object whose ``metrics`` mapping lists the metric names;
                one counter dict is created per metric.
    gerrit   -- object exposing ``data_location``, the output directory.

    Constructing a Repo has side effects: it prints the name and makes
    sure the output directories exist (see create_path).

    NOTE(review): the CSV name is only the basename of the project name,
    and determine_filemode looks for it directly under ``data_location``,
    so the sub-directories made by create_path do not appear to receive
    any file, and two projects with the same basename would share one
    CSV -- confirm whether that is intended.
    """
    def __init__(self, name, settings, gerrit):
        self.touched = False
        self.name = name
        self.dataset = {}
        self.create_path(self.name, gerrit)
        self.filename = '%s.csv' % self.determine_filename(self.name)
        self.filemode = self.determine_filemode(self.filename, gerrit)

        self.today = datetime.today()
        self.email = {'wikimedian': set(), 'volunteer': set()}
        self.num_metrics = 0
        for metric in settings.metrics:
            self.dataset[metric] = {
                # Sentinel far in the future; any real change is older.
                'oldest': datetime(2030, 1, 1),
                'wikimedian': 0,
                'volunteer': 0,
                'total': 0,
            }
            self.num_metrics += 1

    def __str__(self):
        return self.name

    def create_path(self, filename, gerrit):
        """Make sure the output directory for *filename* exists.

        The directory part of *filename* is created below
        ``gerrit.data_location``. When *filename* has no directory part
        the data directory itself is created; previously nothing was
        created in that case, so writing the CSV failed on a fresh
        checkout. Failures are ignored on purpose (best effort).
        """
        print(filename)
        subdir = os.path.dirname(filename)
        if subdir:
            target = os.path.join(gerrit.data_location, subdir)
        else:
            target = gerrit.data_location
        try:
            os.makedirs(target)
        except OSError:
            # Typically "already exists"; any other failure surfaces
            # later, when the CSV file is opened.
            return
        print('Creating %s...' % target)

    def determine_filename(self, filename):
        """Return the last path component of *filename*."""
        return os.path.basename(filename)

    def determine_filemode(self, filename, settings):
        """Return 'a' if the file already exists in the data dir, else 'w'.

        settings -- any object exposing ``data_location``.
        """
        path = os.path.join(settings.data_location, filename)
        if os.path.isfile(path):
            return 'a'
        return 'w'


# ---------------------------------------------------------------------------
# Patch metadata that followed this class on the same original line, kept
# here as comments so that nothing is lost:
#
#   diff --git a/stats.py b/stats.py
#   new file mode 100644
#   index 0000000..40a9985
#   --- /dev/null
#   +++ b/stats.py
#   @@ -0,0 +1,185 @@
#
# First part of the stats.py license header:
#
#   gerrit-stats: Generate codereview stats based from Gerrit commits
#   Copyright (C) 2012 Diederik van Liere, Wikimedia Foundation
#
#   This program is free software; you can redistribute it and/or
#   modify it under the terms of the GNU General Public License
#   as published by the Free Software Foundation; either version 2
#   of the License, or (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#   GNU General Public License for more details.
# ---------------------------------------------------------------------------
# Remainder of the stats.py license header, kept from the original line:
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

import subprocess
import json
import sys
import os
from datetime import datetime

from classes import Gerrit, Settings, Metric, Repo


def create_repo_set(gerrit, settings):
    """Return a dict mapping project name to a Repo for every gerrit project.

    The project list comes from ``gerrit ls-projects`` run over ssh against
    ``gerrit.host``/``gerrit.port``. Projects whose name contains any of the
    strings in ``settings.ignore_repos`` are skipped (substring match).
    """
    listing = run_gerrit_query(
        'ssh -p %s %s gerrit ls-projects' % (gerrit.port, gerrit.host))
    repos = {}
    for line in listing.split('\n'):
        name = line.strip()
        if not name:
            continue
        if any(ignore in name for ignore in settings.ignore_repos):
            continue
        repos[name] = Repo(name, settings, gerrit)
    return repos


def is_wikimedian(email, whitelist):
    """Return True if *email* is whitelisted or is a wikimedia.org address.

    Only the domain part is compared, so look-alike domains such as
    ``notwikimedia.org`` are no longer accepted; sub-domains of
    wikimedia.org still are.
    """
    if email in whitelist:
        return True
    domain = email.rpartition('@')[2].lower()
    return domain == 'wikimedia.org' or domain.endswith('.wikimedia.org')


def set_delimiter(fields, counter):
    """Return '' when *counter* indexes the last item of *fields*, else ','."""
    if len(fields) - counter == 1:
        return ''
    return ','


def output_results(fh, *args):
    """Write the concatenated string form of *args* to *fh* and to stdout."""
    text = ''.join(str(arg) for arg in args)
    fh.write(text)
    sys.stdout.write(text)


def write_heading(fh, repo):
    """Write the CSV header line for *repo* to *fh* and to stdout.

    The columns are ``date``, ``repository`` and one ``<metric>_<field>``
    column per counter, in the same iteration order print_dict uses for
    the values. (The first column used to be misspelled ``data``.)
    """
    columns = []
    for name, metric in repo.dataset.items():
        for heading in metric:
            columns.append('%s_%s' % (name, heading))
    output_results(fh, 'date,repository,', ','.join(columns), '\n')


def construct_timestamp(epoch):
    """Convert a Unix timestamp into a local-time datetime."""
    return datetime.fromtimestamp(epoch)


def run_gerrit_query(query):
    """Run *query* (a whitespace-separated command line) and return its stdout.

    The command is executed without a shell. ``universal_newlines=True``
    makes the output text rather than bytes on Python 3, so callers can
    split it on '\\n' on either Python version.
    """
    process = subprocess.Popen(query.split(), shell=False,
                               stdout=subprocess.PIPE,
                               universal_newlines=True)
    return process.communicate()[0]


def create_dataset(repos, gerrit):
    """Append today's row of counters to every repository's CSV file.

    A header line is written first when the file is new (filemode 'w').
    Everything written to the file is echoed to stdout, followed by a
    separator line per repository.
    """
    for repo in repos.values():
        path = os.path.join(gerrit.data_location, repo.filename)
        # The with-block closes the file even if writing fails midway.
        with open(path, repo.filemode) as fh:
            if repo.filemode == 'w':
                write_heading(fh, repo)
            output_results(fh, repo.today.month, '-', repo.today.day, '-',
                           repo.today.year, ',', repo.name, ',')
            print_dict(repo, fh)
            sys.stdout.write('\n*****************\n')
            sys.stdout.write('\n')
            fh.write('\n')


def print_dict(repo, fh, ident='', braces=1):
    """Write all counter values of *repo* as one comma-separated run.

    The values go to both stdout and *fh*, without a trailing comma or
    newline. *ident* and *braces* are unused and only kept so existing
    callers keep working.
    """
    values = []
    for metric in repo.dataset:
        values.extend(str(value) for value in repo.dataset[metric].values())
    output_results(fh, ','.join(values))


def cleanup_volunteers(repos, whitelist):
    """Move whitelisted addresses from the volunteer set to the wikimedian set.

    Returns *repos* (modified in place). The previous version indexed
    ``repo.email['email']``, which does not exist, and so raised KeyError
    whenever it found an address to move.

    NOTE(review): only the email sets are corrected; the per-metric
    'wikimedian'/'volunteer' counters are left as they were.
    """
    whitelist = set(whitelist)
    for repo in repos.values():
        misfiled = repo.email['volunteer'] & whitelist
        repo.email['wikimedian'] |= misfiled
        repo.email['volunteer'] -= misfiled
    return repos


def construct_dataset(settings, repos, metric, output, gerrit):
    """Fold the output of one gerrit query into the per-repository counters.

    output -- text with one JSON document per line; lines that are blank,
              not valid JSON, not JSON objects, or that contain a
              'rowCount' key are skipped.
    metric -- name of the metric whose counters are updated.

    For every remaining record whose 'project' is a key of *repos*, the
    'oldest' timestamp, the 'total' counter and the wikimedian/volunteer
    counter are updated and the owner's email is remembered.
    """
    for line in output.split('\n'):
        line = line.strip()
        if not line:
            continue
        try:
            obs = json.loads(line)
        except ValueError as e:
            print(e)
            continue
        if not isinstance(obs, dict) or 'rowCount' in obs:
            continue

        if 'project' not in obs:
            # Previously execution carried on with an undefined (or stale)
            # project name; report the record and skip it instead.
            print("'project' %s" % (obs,))
            continue
        repo = repos.get(obs['project'])
        if repo is None:
            continue

        # A record without an owner email cannot be matched against the
        # whitelist or the wikimedia.org domain, so it is counted as
        # volunteer instead of aborting the whole run with a KeyError.
        email = (obs.get('owner') or {}).get('email') or ''
        dt = construct_timestamp(obs['createdOn'])

        counters = repo.dataset[metric]
        if counters['oldest'] > dt:
            counters['oldest'] = dt
        counters['total'] += 1
        if is_wikimedian(email, settings.whitelist):
            group = 'wikimedian'
        else:
            group = 'volunteer'
        counters[group] += 1
        if email:
            repo.email[group].add(email)
        repo.touched = True


def main():
    """Fetch the project list, run every metric query and write the CSVs."""
    gerrit = Gerrit()
    settings = Settings(gerrit)
    print('Fetching list of all gerrit repositories...')
    repos = create_repo_set(gerrit, settings)

    for metric in settings.metrics.values():
        print('Running %s' % metric.query)
        output = run_gerrit_query(metric.query)
        construct_dataset(settings, repos, metric.name, output, gerrit)

    print('Fixing miscategorization of volunteer engineers...')
    repos = cleanup_volunteers(repos, settings.whitelist)
    print('Creating datasets...')
    create_dataset(repos, gerrit)


if __name__ == '__main__':
    main()