Initial commit of the gerrit-stats
author Diederik <dvanliere@wikimedia.org>
Fri, 25 May 2012 02:24:56 +0000 (22:24 -0400)
committer Diederik <dvanliere@wikimedia.org>
Fri, 25 May 2012 02:24:56 +0000 (22:24 -0400)
classes.py [new file with mode: 0644]
stats.py [new file with mode: 0644]

diff --git a/classes.py b/classes.py
new file mode 100644 (file)
index 0000000..04b3096
--- /dev/null
@@ -0,0 +1,118 @@
+"""
+gerrit-stats: Generate codereview stats based on Gerrit commits
+Copyright (C) 2012  Diederik van Liere, Wikimedia Foundation
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+"""
+
+import os
+from datetime import datetime
+
+class Metric(object):
+       '''
+       A single named Gerrit query.
+
+       name      -- label under which the results of this query are stored
+       raw_query -- the search operators appended to 'gerrit query'
+       settings  -- object providing the port, host and format attributes
+                    that are interpolated into the ssh command line
+       '''
+       def __init__(self, name, raw_query, settings):
+               self.name = name
+               self.raw_query = raw_query
+               # Complete ssh command line as one string; it is only built here, not executed.
+               command_parts = (settings.port, settings.host, settings.format, raw_query)
+               self.query = 'ssh -p %s %s gerrit query --format=%s %s' % command_parts
+
+
+class Settings(object):
+       '''
+       Configuration shared by all repositories: the raw queries behind each
+       metric, a whitelist of email addresses, the repositories to ignore and
+       a list of parent projects.
+
+       The constructor argument is handed to every Metric, which reads the
+       port, host and format attributes from it to build its command line.
+       '''
+       def __init__(self, settings):
+               self.queries = {
+                       'only+1': '-- CodeReview+1 -CodeReview+2 -CodeReview-1 -CodeReview-2',
+                       'no_review': '-- -CodeReview+1 -CodeReview-1 -CodeReview+2 -CodeReview-2',
+               }
+               self.whitelist = set([
+                       'niklas.laxstrom@gmail.com',
+                       'roan.kattouw@gmail.com',
+                       'maxsem.wiki@gmail.com',
+                       's.mazeland@xs4all.nl',
+                       'jeroendedauw@gmail.com',
+                       'mediawiki@danielfriesen.name',
+                       'jdlrobson@gmail.com',
+                       'hashar@free.fr',
+               ])
+               self.ignore_repos = ['test']
+               self.parents = [
+                       'mediawiki/core',
+                       'mediawiki/extensions',
+                       'operations',
+                       'analytics',
+               ]
+               # One Metric object per configured query, keyed by the query name.
+               self.metrics = dict(
+                       (name, Metric(name, query, settings))
+                       for name, query in self.queries.iteritems()
+               )
+
+       def __str__(self):
+               return 'Metrics container object'
+
+
+class Gerrit(object):
+       '''
+       Default settings for talking to the Gerrit server, plus the name of the
+       local directory used for the generated data files.
+       '''
+       def __init__(self):
+               # ssh endpoint of the Gerrit instance
+               self.host = 'gerrit.wikimedia.org'
+               self.port = 29418
+               # value interpolated into 'gerrit query --format=...'
+               self.format = 'JSON'
+               # directory holding the per-repository data files
+               self.data_location = 'data'
+
+       def __str__(self):
+               return 'Codereview settings object.'
+
+
+class Repo(object):
+       '''
+       Bookkeeping for one Gerrit repository.
+
+       name        -- full project name as reported by Gerrit
+       dataset     -- maps each metric name from settings.metrics to a dict with
+                      the keys 'oldest', 'wikimedian', 'volunteer' and 'total'
+       email       -- sets of author email addresses, keyed by 'wikimedian'
+                      and 'volunteer'
+       filename    -- '<basename of name>.csv'
+       filemode    -- 'a' when that file already exists in the data directory,
+                      otherwise 'w'
+       num_metrics -- number of entries in dataset
+       touched     -- starts out False; this class never sets it to True
+
+       NOTE(review): create_path() creates <data_location>/<dirname of name>,
+       but filename keeps only the basename -- confirm whether the CSV files
+       were meant to live inside those directories.
+       '''
+       def __init__(self, name, settings, gerrit):
+               self.touched = False
+               self.name = name
+               self.dataset = {}
+               self.create_path(self.name, gerrit)
+               self.filename = '%s.csv' % self.determine_filename(self.name)
+               self.filemode = self.determine_filemode(self.filename, gerrit)
+
+               self.today = datetime.today()
+               self.email = {}
+               self.email['wikimedian'] = set()
+               self.email['volunteer'] = set()
+               self.num_metrics = 0
+               for metric in settings.metrics:
+                       # Keys are inserted in a fixed order on purpose: the order in which
+                       # this dict iterates determines the column order of the output.
+                       self.dataset[metric] = {}
+                       self.dataset[metric]['oldest'] = datetime(2030, 1, 1)
+                       self.dataset[metric]['wikimedian'] = 0
+                       self.dataset[metric]['volunteer'] = 0
+                       self.dataset[metric]['total'] = 0
+                       self.num_metrics += 1
+
+       def __str__(self):
+               return self.name
+
+       def create_path(self, filename, gerrit):
+               '''
+               Create <gerrit.data_location>/<directory part of filename> if the
+               name has a directory part. Raises OSError when the directory
+               cannot be created and does not already exist.
+               '''
+               print filename
+               subdir = os.path.dirname(filename)
+               if subdir == '':
+                       return
+               path = os.path.join(gerrit.data_location, subdir)
+               try:
+                       os.makedirs(path)
+                       print 'Creating %s...' % path
+               except OSError:
+                       # An existing directory is expected on every run after the first;
+                       # any other failure (e.g. permissions) must not be hidden.
+                       if not os.path.isdir(path):
+                               raise
+
+       def determine_filename(self, filename):
+               '''Return the last path component of filename.'''
+               return os.path.basename(filename)
+
+       def determine_filemode(self, filename, settings):
+               '''
+               Return 'a' when <settings.data_location>/<filename> is an existing
+               file and 'w' otherwise.
+               '''
+               if os.path.isfile('%s/%s' % (settings.data_location, filename)):
+                       return 'a'
+               return 'w'
+
diff --git a/stats.py b/stats.py
new file mode 100644 (file)
index 0000000..40a9985
--- /dev/null
+++ b/stats.py
@@ -0,0 +1,185 @@
+"""\r
+gerrit-stats: Generate codereview stats based on Gerrit commits\r
+Copyright (C) 2012  Diederik van Liere, Wikimedia Foundation\r
+\r
+This program is free software; you can redistribute it and/or\r
+modify it under the terms of the GNU General Public License\r
+as published by the Free Software Foundation; either version 2\r
+of the License, or (at your option) any later version.\r
+\r
+This program is distributed in the hope that it will be useful,\r
+but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
+GNU General Public License for more details.\r
+\r
+You should have received a copy of the GNU General Public License\r
+along with this program; if not, write to the Free Software\r
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\r
+"""\r
+import subprocess\r
+import json\r
+import sys\r
+import os\r
+from datetime import datetime\r
+\r
+from classes import Gerrit, Settings, Metric, Repo\r
+\r
+def create_repo_set(gerrit, settings):\r
+       """\r
+       Fetch the list of projects from Gerrit and return a dict that maps\r
+       each project name to a freshly created Repo object.\r
+\r
+       The ssh command is built from gerrit.port and gerrit.host. Projects\r
+       whose name contains any entry of settings.ignore_repos are left out.\r
+       """\r
+       command = 'ssh -p %s %s gerrit ls-projects' % (gerrit.port, gerrit.host)\r
+       listing = run_gerrit_query(command)\r
+       repos = {}\r
+       for line in listing.split('\n'):\r
+               name = line.strip()\r
+               # Skips blank lines; names of a single character are skipped as well.\r
+               if len(name) <= 1:\r
+                       continue\r
+               if any(ignore in name for ignore in settings.ignore_repos):\r
+                       continue\r
+               repos[name] = Repo(name, settings, gerrit)\r
+       return repos\r
+\r
+\r
+def is_wikimedian(email, whitelist):\r
+       """\r
+       Return True when email is in whitelist or belongs to wikimedia.org\r
+       (or one of its subdomains), False otherwise.\r
+       """\r
+       if email in whitelist:\r
+               return True\r
+       # Require a boundary before the domain so that addresses such as\r
+       # 'user@notwikimedia.org' are not mistaken for Wikimedia addresses.\r
+       return email.endswith(('@wikimedia.org', '.wikimedia.org'))\r
+\r
+\r
+def set_delimiter(fields, counter):\r
+       """\r
+       Return '' when counter is the index of the last element of fields\r
+       and ',' for every other value of counter.\r
+       """\r
+       remaining = len(fields) - counter\r
+       return '' if remaining == 1 else ','\r
+\r
+def output_results(fh, *args):\r
+       """\r
+       Convert every positional argument to a string, concatenate them\r
+       without a separator and write the result to both fh and stdout.\r
+       """\r
+       text = ''.join(str(piece) for piece in args)\r
+       fh.write(text)\r
+       sys.stdout.write(text)\r
+\r
+def write_heading(fh, repo):\r
+       """\r
+       Write the CSV header line for repo to fh and to stdout.\r
+\r
+       The header starts with 'date' and 'repository', followed by one\r
+       '<metric>_<field>' column for every field of every metric in\r
+       repo.dataset, in the iteration order of those dicts.\r
+       """\r
+       columns = []\r
+       for name, metric in repo.dataset.iteritems():\r
+               columns.extend('%s_%s' % (name, heading) for heading in metric.keys())\r
+       # The first column holds a date, so it is labelled 'date' (it used to\r
+       # be misspelled as 'data').\r
+       output_results(fh, 'date,repository,', ','.join(columns), '\n')\r
+\r
+\r
+def construct_timestamp(epoch):\r
+       """Turn a POSIX timestamp into a datetime via datetime.fromtimestamp."""\r
+       moment = datetime.fromtimestamp(epoch)\r
+       return moment\r
+\r
+\r
+def run_gerrit_query(query):\r
+       """\r
+       Run query as an external command and return what it wrote to stdout.\r
+\r
+       The command line is split on single spaces and executed without a\r
+       shell, so arguments cannot contain spaces themselves.\r
+       """\r
+       argv = query.split(' ')\r
+       process = subprocess.Popen(argv, shell=False, stdout=subprocess.PIPE)\r
+       stdout, _ = process.communicate()\r
+       return stdout\r
+\r
+\r
+def create_dataset(repos, gerrit):\r
+       """\r
+       Write one CSV row per repository to <gerrit.data_location>/<repo.filename>.\r
+\r
+       The file is opened with repo.filemode; when that mode is 'w' a header\r
+       line is written first. Each row starts with today's date formatted as\r
+       month-day-year and the repository name, followed by the values of\r
+       repo.dataset. Everything is echoed to stdout as well.\r
+       """\r
+       for repo in repos.itervalues():\r
+               fh = open('%s/%s' % (gerrit.data_location, repo.filename), repo.filemode)\r
+               # try/finally makes sure the file is closed even when writing fails.\r
+               try:\r
+                       if repo.filemode == 'w':\r
+                               write_heading(fh, repo)\r
+                       output_results(fh, repo.today.month, '-', repo.today.day, '-', repo.today.year, ',', repo.name, ',')\r
+                       print_dict(repo, fh)\r
+                       sys.stdout.write('\n*****************\n')\r
+                       sys.stdout.write('\n')\r
+                       fh.write('\n')\r
+               finally:\r
+                       fh.close()\r
+\r
+\r
+def print_dict(repo, fh, ident = '', braces=1):\r
+       """\r
+       Write every value of repo.dataset, metric by metric and field by\r
+       field, to stdout and to fh as comma separated cells.\r
+\r
+       No comma is written after the last field of the last metric. The\r
+       ident and braces arguments are accepted but not used.\r
+       """\r
+       dataset = repo.dataset\r
+       last_metric = repo.num_metrics - 1\r
+       for metric_index, metric in enumerate(dataset):\r
+               values = dataset[metric]\r
+               names = values.keys()\r
+               last_field = len(names) - 1\r
+               for field_index, field in enumerate(names):\r
+                       is_final_cell = (metric_index == last_metric and field_index == last_field)\r
+                       cell = '%s%s' % (values[field], '' if is_final_cell else ',')\r
+                       sys.stdout.write(cell)\r
+                       fh.write(cell)\r
+\r
+\r
+def cleanup_volunteers(repos, whitelist):\r
+       """\r
+       Move every whitelisted address found in a repository's 'volunteer'\r
+       email set over to its 'wikimedian' email set and return repos.\r
+\r
+       Only the email sets are adjusted; the counters in repo.dataset are\r
+       left untouched.\r
+       """\r
+       for repo in repos.itervalues():\r
+               misfiled = repo.email['volunteer'].intersection(whitelist)\r
+               repo.email['wikimedian'].update(misfiled)\r
+               # The sets live directly under repo.email; there is no 'email' sub-key.\r
+               repo.email['volunteer'].difference_update(misfiled)\r
+       return repos\r
+\r
+\r
+def construct_dataset(settings, repos, metric, output, gerrit):\r
+       """\r
+       Parse the output of one Gerrit query and update the counters of the\r
+       repositories it mentions.\r
+\r
+       settings -- provides the whitelist used to classify authors\r
+       repos    -- dict mapping project name to Repo; unknown projects are skipped\r
+       metric   -- key into repo.dataset whose counters are updated\r
+       output   -- query result, one JSON document per line\r
+       gerrit   -- not used\r
+\r
+       Lines that are blank, cannot be decoded, are not JSON objects, contain\r
+       a 'rowCount' key or lack the project / owner email are skipped.\r
+       """\r
+       for line in output.split('\n'):\r
+               if not line.strip():\r
+                       # e.g. the empty string after the final newline\r
+                       continue\r
+               try:\r
+                       obs = json.loads(line)\r
+               except ValueError, e:\r
+                       print e\r
+                       continue\r
+\r
+               if not isinstance(obs, dict) or 'rowCount' in obs:\r
+                       continue\r
+               try:\r
+                       project = obs['project']\r
+                       email = obs['owner']['email']\r
+               except KeyError, e:\r
+                       # Without these fields the observation cannot be attributed;\r
+                       # skip it instead of reusing values from a previous line.\r
+                       print e, obs\r
+                       continue\r
+               repo = repos.get(project)\r
+               if repo is None:\r
+                       continue\r
+               dt = construct_timestamp(obs['createdOn'])\r
+\r
+               counts = repo.dataset[metric]\r
+               if counts['oldest'] > dt:\r
+                       counts['oldest'] = dt\r
+               counts['total'] += 1\r
+               if is_wikimedian(email, settings.whitelist):\r
+                       group = 'wikimedian'\r
+               else:\r
+                       group = 'volunteer'\r
+               counts[group] += 1\r
+               repo.email[group].add(email)\r
+               repo.touched = True\r
+                       repo.touched = True\r
+\r
+\r
+def main():\r
+       """\r
+       Entry point: list the Gerrit repositories, run every configured metric\r
+       query, reclassify whitelisted addresses and write the datasets.\r
+       """\r
+       gerrit = Gerrit()\r
+       settings = Settings(gerrit)\r
+       print 'Fetching list of all gerrit repositories...'\r
+       repos = create_repo_set(gerrit, settings)\r
+\r
+       for metric in settings.metrics.itervalues():\r
+               # Announce the query before running it, so the message is already\r
+               # visible while the command is still executing.\r
+               print 'Running %s' % metric.query\r
+               output = run_gerrit_query(metric.query)\r
+               construct_dataset(settings, repos, metric.name, output, gerrit)\r
+\r
+       print 'Fixing miscategorization of volunteer engineers...'\r
+       repos = cleanup_volunteers(repos, settings.whitelist)\r
+       print 'Creating datasets...'\r
+       create_dataset(repos, gerrit)\r
+\r
+\r
+# Run main() only when this file is executed as a script, not when it is imported.\r
+if __name__== '__main__':\r
+       main()
\ No newline at end of file