python source code of repository

"""
.. module:: repository
   :platform: Unix, Windows
   :synopsis: A module for examining a single git repository

.. moduleauthor:: Will McGinnis <will@pedalwrencher.com>


"""

import os
import sys
import datetime
import time
import json
import logging
import tempfile
import fnmatch
import shutil
import warnings
import numpy as np
from git import Repo, GitCommandError
from gitpandas.cache import multicache, EphemeralCache, RedisDFCache
from pandas import DataFrame, to_datetime

try:
    from joblib import delayed, Parallel

    _has_joblib = True
except ImportError as e:
    _has_joblib = False

__author__ = 'willmcginnis'


def _parallel_cumulative_blame_func(self_, x, committer, ignore_globs, include_globs):
    blm = self_.blame(
        rev=x['rev'],
        committer=committer,
        ignore_globs=ignore_globs,
        include_globs=include_globs
    )
    x.update(json.loads(blm.to_json())['loc'])

    return x


class Repository(object):
    """
    The base class for a generic git repository, from which to gather statistics.  The object encapulates a single
    gitpython Repo instance.

    :param working_dir: the directory of the git repository, meaning a .git directory is in it (default None=cwd)
    :param verbose: optional, verbosity level of output, bool
    :param tmp_dir: optional, a path to clone the repo into if necessary. Will create one if none passed.
    :param cache_backend: optional, an instantiated cache backend from gitpandas.cache
    :return:
    """

    def __init__(self, working_dir=None, verbose=False, tmp_dir=None, cache_backend=None):
        self.verbose = verbose
        self.log = logging.getLogger('gitpandas')
        self.__delete_hook = False
        self._git_repo_name = None
        self.cache_backend = cache_backend
        if working_dir is not None:
            if working_dir[:3] == 'git':
                # if a tmp dir is passed, clone into that, otherwise make a temp directory.
                if tmp_dir is None:
                    if self.verbose:
                        print('cloning repository: %s into a temporary location' % (working_dir,))
                    dir_path = tempfile.mkdtemp()
                else:
                    dir_path = tmp_dir

                self.repo = Repo.clone_from(working_dir, dir_path)
                self._git_repo_name = working_dir.split(os.sep)[-1].split('.')[0]
                self.git_dir = dir_path
                self.__delete_hook = True
            else:
                self.git_dir = working_dir
                self.repo = Repo(self.git_dir)
        else:
            self.git_dir = os.getcwd()
            self.repo = Repo(self.git_dir)

        if self.verbose:
            print('Repository [%s] instantiated at directory: %s' % (self._repo_name(), self.git_dir))

    def __del__(self):
        """
        On delete, clean up any temporary repositories still hanging around

        :return:
        """
        if self.__delete_hook:
            if os.path.exists(self.git_dir):
                shutil.rmtree(self.git_dir)

    def is_bare(self):
        """
        Returns a boolean for if the repo is bare or not

        :return: bool
        """

        return self.repo.bare

    def has_coverage(self):
        """
        Returns a boolean for is a parseable .coverage file can be found in the repository

        :return: bool

        """

        if os.path.exists(self.git_dir + os.sep + '.coverage'):
            try:
                with open(self.git_dir + os.sep + '.coverage', 'r') as f:
                    blob = f.read()
                    blob = blob.split('!')[2]
                    json.loads(blob)
                return True
            except Exception:
                return False
        else:
            return False

    def coverage(self):
        """
        If there is a .coverage file available, this will attempt to form a DataFrame with that information in it, which
        will contain the columns:

         * filename
         * lines_covered
         * total_lines
         * coverage

        If it can't be found or parsed, an empty DataFrame of that form will be returned.

        :return: DataFrame
        """

        if not self.has_coverage():
            return DataFrame(columns=['filename', 'lines_covered', 'total_lines', 'coverage'])

        with open(self.git_dir + os.sep + '.coverage', 'r') as f:
            blob = f.read()
            blob = blob.split('!')[2]
            cov = json.loads(blob)

        ds = []
        for filename in cov['lines'].keys():
            idx = 0
            try:
                with open(filename, 'r') as f:
                    for idx, _ in enumerate(f):
                        pass
            except FileNotFoundError as e:
                if self.verbose:
                    warnings.warn('Could not find file %s for coverage' % (filename, ))

            num_lines = idx + 1

            try:
                short_filename = filename.split(self.git_dir + os.sep)[1]
                ds.append([short_filename, len(cov['lines'][filename]), num_lines])
            except IndexError as e:
                if self.verbose:
                    warnings.warn('Could not find file %s for coverage' % (filename, ))

        df = DataFrame(ds, columns=['filename', 'lines_covered', 'total_lines'])
        df['coverage'] = df['lines_covered'] / df['total_lines']

        return df

    def hours_estimate(self, branch='master', grouping_window=0.5, single_commit_hours=0.5, limit=None, days=None,
                       committer=True, ignore_globs=None, include_globs=None):
        """
        inspired by: https://github.com/kimmobrunfeldt/git-hours/blob/8aaeee237cb9d9028e7a2592a25ad8468b1f45e4/index.js#L114-L143

        Iterates through the commit history of repo to estimate the time commitement of each author or committer over
        the course of time indicated by limit/extensions/days/etc.

        :param branch: the branch to return commits for
        :param limit: (optional, default=None) a maximum number of commits to return, None for no limit
        :param grouping_window: (optional, default=0.5 hours) the threhold for how close two commits need to be to consider them part of one coding session
        :param single_commit_hours: (optional, default 0.5 hours) the time range to associate with one single commit
        :param days: (optional, default=None) number of days to return, if limit is None
        :param committer: (optional, default=True) whether to use committer vs. author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :return: DataFrame
        """

        max_diff_in_minutes = grouping_window * 60.0
        first_commit_addition_in_minutes = single_commit_hours * 60.0

        # First get the commit history
        ch = self.commit_history(branch=branch, limit=limit, days=days, ignore_globs=ignore_globs,
                                 include_globs=include_globs)

        # split by committer|author
        if committer:
            by = 'committer'
        else:
            by = 'author'
        people = set(ch[by].values)

        ds = []
        for person in people:
            commits = ch[ch[by] == person]
            commits_ts = [x * 10e-10 for x in sorted(commits.index.values.tolist())]

            if len(commits_ts) < 2:
                ds.append([person, 0])
                continue

            def estimate(index, date):
                next_ts = commits_ts[index + 1]
                diff_in_minutes = next_ts - date
                diff_in_minutes /= 60.0
                if diff_in_minutes < max_diff_in_minutes:
                    return diff_in_minutes / 60.0
                return first_commit_addition_in_minutes / 60.0

            hours = [estimate(a, b) for a, b in enumerate(commits_ts[:-1])]
            hours = sum(hours)
            ds.append([person, hours])

        df = DataFrame(ds, columns=[by, 'hours'])

        return df

    def commit_history(self, branch='master', limit=None, days=None, ignore_globs=None, include_globs=None):
        """
        Returns a pandas DataFrame containing all of the commits for a given branch. Included in that DataFrame will be
        the columns:

         * date (index)
         * author
         * committer
         * message
         * lines
         * insertions
         * deletions
         * net

        :param branch: the branch to return commits for
        :param limit: (optional, default=None) a maximum number of commits to return, None for no limit
        :param days: (optional, default=None) number of days to return, if limit is None
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :return: DataFrame
        """

        # setup the data-set of commits
        if limit is None:
            if days is None:
                ds = [[
                          x.author.name,
                          x.committer.name,
                          x.committed_date,
                          x.message,
                          self.__check_extension(x.stats.files, ignore_globs=ignore_globs, include_globs=include_globs)
                      ] for x in self.repo.iter_commits(branch, max_count=sys.maxsize)]
            else:
                ds = []
                c_date = time.time()
                commits = self.repo.iter_commits(branch, max_count=sys.maxsize)
                dlim = time.time() - days * 24 * 3600
                while c_date > dlim:
                    try:
                        if sys.version_info.major == 2:
                            x = commits.next()
                        else:
                            x = commits.__next__()
                    except StopIteration:
                        break
                    c_date = x.committed_date
                    if c_date > dlim:
                        ds.append([
                            x.author.name,
                            x.committer.name,
                            x.committed_date,
                            x.message,
                            self.__check_extension(x.stats.files, ignore_globs=ignore_globs,
                                                   include_globs=include_globs)
                        ])

        else:
            ds = [[
                      x.author.name,
                      x.committer.name,
                      x.committed_date,
                      x.message,
                      self.__check_extension(x.stats.files, ignore_globs=ignore_globs, include_globs=include_globs)
                  ] for x in self.repo.iter_commits(branch, max_count=limit)]

        # aggregate stats
        ds = [x[:-1] + [sum([x[-1][key]['lines'] for key in x[-1].keys()]),
                        sum([x[-1][key]['insertions'] for key in x[-1].keys()]),
                        sum([x[-1][key]['deletions'] for key in x[-1].keys()]),
                        sum([x[-1][key]['insertions'] for key in x[-1].keys()]) - sum(
                            [x[-1][key]['deletions'] for key in x[-1].keys()])
                        ] for x in ds if len(x[-1].keys()) > 0]

        # make it a pandas dataframe
        df = DataFrame(ds,
                       columns=['author', 'committer', 'date', 'message', 'lines', 'insertions', 'deletions', 'net'])

        # format the date col and make it the index
        df['date'] = to_datetime(df['date'].map(datetime.datetime.fromtimestamp))
        df.set_index(keys=['date'], drop=True, inplace=True)

        return df

    def file_change_history(self, branch='master', limit=None, days=None, ignore_globs=None, include_globs=None):
        """
        Returns a DataFrame of all file changes (via the commit history) for the specified branch.  This is similar to
        the commit history DataFrame, but is one row per file edit rather than one row per commit (which may encapsulate
        many file changes). Included in the DataFrame will be the columns:

         * date (index)
         * author
         * committer
         * message
         * filename
         * insertions
         * deletions

        :param branch: the branch to return commits for
        :param limit: (optional, default=None) a maximum number of commits to return, None for no limit
        :param days: (optional, default=None) number of days to return if limit is None
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :return: DataFrame
        """

        # setup the dataset of commits
        if limit is None:
            if days is None:
                ds = [[
                          x.author.name,
                          x.committer.name,
                          x.committed_date,
                          x.message,
                          x.name_rev.split()[0],
                          self.__check_extension(x.stats.files, ignore_globs=ignore_globs, include_globs=include_globs)
                      ] for x in self.repo.iter_commits(branch, max_count=sys.maxsize)]
            else:
                ds = []
                c_date = time.time()
                commits = self.repo.iter_commits(branch, max_count=sys.maxsize)
                dlim = time.time() - days * 24 * 3600
                while c_date > dlim:
                    try:
                        if sys.version_info.major == 2:
                            x = commits.next()
                        else:
                            x = commits.__next__()
                    except StopIteration:
                        break

                    c_date = x.committed_date
                    if c_date > dlim:
                        ds.append([
                            x.author.name,
                            x.committer.name,
                            x.committed_date,
                            x.message,
                            x.name_rev.split()[0],
                            self.__check_extension(x.stats.files, ignore_globs=ignore_globs,
                                                   include_globs=include_globs)
                        ])

        else:
            ds = [[
                      x.author.name,
                      x.committer.name,
                      x.committed_date,
                      x.message,
                      x.name_rev.split()[0],
                      self.__check_extension(x.stats.files, ignore_globs=ignore_globs, include_globs=include_globs)
                  ] for x in self.repo.iter_commits(branch, max_count=limit)]

        ds = [x[:-1] + [fn, x[-1][fn]['insertions'], x[-1][fn]['deletions']] for x in ds for fn in x[-1].keys() if
              len(x[-1].keys()) > 0]

        # make it a pandas dataframe
        df = DataFrame(ds,
                       columns=['author', 'committer', 'date', 'message', 'rev', 'filename', 'insertions', 'deletions'])

        # format the date col and make it the index
        df['date'] = to_datetime(df['date'].map(datetime.datetime.fromtimestamp))
        df.set_index(keys=['date'], drop=True, inplace=True)

        return df

    def file_change_rates(self, branch='master', limit=None, coverage=False, days=None, ignore_globs=None,
                          include_globs=None):
        """
        This function will return a DataFrame containing some basic aggregations of the file change history data, and
        optionally test coverage data from a coverage_data.py .coverage file.  The aim here is to identify files in the
        project which have abnormal edit rates, or the rate of changes without growing the files size.  If a file has
        a high change rate and poor test coverage, then it is a great candidate for writing more tests.

        :param branch: (optional, default=master) the branch to return commits for
        :param limit: (optional, default=None) a maximum number of commits to return, None for no limit
        :param coverage: (optional, default=False) a bool for whether or not to attempt to join in coverage data.
        :param days: (optional, default=None) number of days to return if limit is None
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :return: DataFrame
        """

        fch = self.file_change_history(
            branch=branch,
            limit=limit,
            days=days,
            ignore_globs=ignore_globs,
            include_globs=include_globs
        )
        fch.reset_index(level=0, inplace=True)

        if fch.shape[0] > 0:
            file_history = fch.groupby('filename').agg(
                {
                    'insertions': [np.sum, np.max, np.mean],
                    'deletions': [np.sum, np.max, np.mean],
                    'message': lambda x: ','.join(['"' + str(y) + '"' for y in x]),
                    'committer': lambda x: ','.join(['"' + str(y) + '"' for y in x]),
                    'author': lambda x: ','.join(['"' + str(y) + '"' for y in x]),
                    'date': [np.max, np.min]
                }
            )

            file_history.columns = [' '.join(col).strip() for col in file_history.columns.values]

            file_history = file_history.rename(columns={
                'message <lambda>': 'messages',
                'committer <lambda>': 'committers',
                'insertions sum': 'total_insertions',
                'insertions amax': 'max_insertions',
                'insertions mean': 'mean_insertions',
                'author <lambda>': 'authors',
                'date amax': 'max_date',
                'date amin': 'min_date',
                'deletions sum': 'total_deletions',
                'deletions amax': 'max_deletions',
                'deletions mean': 'mean_deletions'
            })

            # get some building block values for later use
            file_history['net_change'] = file_history['total_insertions'] - file_history['total_deletions']
            file_history['abs_change'] = file_history['total_insertions'] + file_history['total_deletions']
            file_history['delta_time'] = file_history['max_date'] - file_history['min_date']

            try:
                file_history['delta_days'] = file_history['delta_time'].map(
                    lambda x: np.ceil(x.seconds / (24 * 3600) + 0.01))
            except AttributeError as e:
                file_history['delta_days'] = file_history['delta_time'].map(
                    lambda x: np.ceil((float(x.total_seconds()) * 10e-6) / (24 * 3600) + 0.01))

            # calculate metrics
            file_history['net_rate_of_change'] = file_history['net_change'] / file_history['delta_days']
            file_history['abs_rate_of_change'] = file_history['abs_change'] / file_history['delta_days']
            file_history['edit_rate'] = file_history['abs_rate_of_change'] - file_history['net_rate_of_change']
            file_history['unique_committers'] = file_history['committers'].map(lambda x: len(set(x.split(','))))

            # reindex
            file_history = file_history.reindex(
                columns=['unique_committers', 'abs_rate_of_change', 'net_rate_of_change', 'net_change', 'abs_change',
                         'edit_rate'])
            file_history.sort_values(by=['edit_rate'], inplace=True)

            if coverage and self.has_coverage():
                file_history = file_history.merge(self.coverage(), left_index=True, right_on='filename', how='outer')
                file_history.set_index(keys=['filename'], drop=True, inplace=True)
        else:
            file_history = DataFrame(
                columns=['unique_committers', 'abs_rate_of_change', 'net_rate_of_change', 'net_change', 'abs_change',
                         'edit_rate'])

        return file_history

    @staticmethod
    def __check_extension(files, ignore_globs=None, include_globs=None):
        """
        Internal method to filter a list of file changes by extension and ignore_dirs.

        :param files:
        :param ignore_globs: a list of globs to ignore (if none falls back to extensions and ignore_dir)
        :param include_globs: a list of globs to include (if none, includes all).
        :return: dict
        """

        if include_globs is None or include_globs == []:
            include_globs = ['*']

        out = {}
        for key in files.keys():
            # count up the number of patterns in the ignore globs list that match
            if ignore_globs is not None:
                count_exclude = sum([1 if fnmatch.fnmatch(key, g) else 0 for g in ignore_globs])
            else:
                count_exclude = 0

            # count up the number of patterns in the include globs list that match
            count_include = sum([1 if fnmatch.fnmatch(key, g) else 0 for g in include_globs])

            # if we have one vote or more to include and none to exclude, then we use the file.
            if count_include > 0 and count_exclude == 0:
                out[key] = files[key]

        return out

    @multicache(
        key_prefix='blame',
        key_list=['rev', 'committer', 'by', 'ignore_blobs', 'include_globs'],
        skip_if=lambda x: True if x.get('rev') is None or x.get('rev') == 'HEAD' else False
    )
    def blame(self, rev='HEAD', committer=True, by='repository', ignore_globs=None, include_globs=None):
        """
        Returns the blame from the current HEAD of the repository as a DataFrame.  The DataFrame is grouped by committer
        name, so it will be the sum of all contributions to the repository by each committer. As with the commit history
        method, extensions and ignore_dirs parameters can be passed to exclude certain directories, or focus on certain
        file extensions. The DataFrame will have the columns:

         * committer
         * loc

        :param rev: (optional, default=HEAD) the specific revision to blame
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :param by: (optional, default=repository) whether to group by repository or by file
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :return: DataFrame
        """

        blames = []
        file_names = [x for x in self.repo.git.log(pretty='format:', name_only=True, diff_filter='A').split('\n') if
                      x.strip() != '']
        for file in self.__check_extension({x: x for x in file_names}, ignore_globs=ignore_globs,
                                           include_globs=include_globs).keys():
            try:
                blames.append(
                    [x + [str(file).replace(self.git_dir + '/', '')] for x in
                     self.repo.blame(rev, str(file).replace(self.git_dir + '/', ''))]
                )
            except GitCommandError:
                pass

        blames = [item for sublist in blames for item in sublist]
        if committer:
            if by == 'repository':
                blames = DataFrame(
                    [[x[0].committer.name, len(x[1])] for x in blames],
                    columns=['committer', 'loc']
                ).groupby('committer').agg({'loc': np.sum})
            elif by == 'file':
                blames = DataFrame(
                    [[x[0].committer.name, len(x[1]), x[2]] for x in blames],
                    columns=['committer', 'loc', 'file']
                ).groupby(['committer', 'file']).agg({'loc': np.sum})
        else:
            if by == 'repository':
                blames = DataFrame(
                    [[x[0].author.name, len(x[1])] for x in blames],
                    columns=['author', 'loc']
                ).groupby('author').agg({'loc': np.sum})
            elif by == 'file':
                blames = DataFrame(
                    [[x[0].author.name, len(x[1]), x[2]] for x in blames],
                    columns=['author', 'loc', 'file']
                ).groupby(['author', 'file']).agg({'loc': np.sum})

        return blames

    def revs(self, branch='master', limit=None, skip=None, num_datapoints=None):
        """
        Returns a dataframe of all revision tags and their timestamps. It will have the columns:

         * date
         * rev

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :return: DataFrame

        """

        if limit is None and skip is None and num_datapoints is not None:
            limit = sum(1 for _ in self.repo.iter_commits())
            skip = int(float(limit) / num_datapoints)
        else:
            if limit is None:
                limit = sys.maxsize
            elif skip is not None:
                limit = limit * skip

        ds = [[x.committed_date, x.name_rev.split(' ')[0]] for x in self.repo.iter_commits(branch, max_count=limit)]
        df = DataFrame(ds, columns=['date', 'rev'])

        if skip is not None:
            if skip == 0:
                skip = 1

            if df.shape[0] >= skip:
                df = df.ix[range(0, df.shape[0], skip)]
                df.reset_index()
            else:
                df = df.ix[[0]]
                df.reset_index()

        return df

    def cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                         ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. Index is a datetime, column per committer, with number of lines
        blamed to each committer at each timestamp as data.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, defualt=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :return: DataFrame

        """

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        # get the commit history to stub out committers (hacky and slow)
        if sys.version_info.major == 2:
            committers = set([x.committer.name for x in self.repo.iter_commits(branch, max_count=sys.maxsize)])
        else:
            committers = {x.committer.name for x in self.repo.iter_commits(branch, max_count=sys.maxsize)}

        for y in committers:
            revs[y] = 0

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        # now populate that table with some actual values
        for idx, row in revs.iterrows():
            if self.verbose:
                print('%s. [%s] getting blame for rev: %s' % (
                str(idx), datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'), row.rev,))

            blame = self.blame(rev=row.rev, committer=committer, ignore_globs=ignore_globs, include_globs=include_globs)
            for y in committers:
                try:
                    loc = blame.loc[y, 'loc']
                    revs.set_value(idx, y, loc)
                except KeyError:
                    pass

        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        revs = revs.ix[keep_idx]

        return revs

    def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                                  workers=1, ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. Index is a datetime, column per committer, with number of lines
        blamed to each committer at each timestamp as data.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, defualt=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core.
        :return: DataFrame

        """

        if not _has_joblib:
            raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use
            cumulative_blame() instead.''')

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        revisions = json.loads(revs.to_json(orient='index'))
        revisions = [revisions[key] for key in revisions]

        ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
            delayed(_parallel_cumulative_blame_func)
            (self, x, committer, ignore_globs, include_globs) for x in revisions
        )

        revs = DataFrame(ds)
        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        revs = revs.ix[keep_idx]
        revs.sort_index(ascending=False, inplace=True)

        return revs

    def branches(self):
        """
        Returns a data frame of all branches in origin.  The DataFrame will have the columns:

         * repository
         * branch
         * local

        :returns: DataFrame
        """

        # first pull the local branches
        local_branches = self.repo.branches
        data = [[x.name, True] for x in list(local_branches)]

        # then the remotes
        remote_branches = self.repo.git.branch(all=True).split('\n')
        if sys.version_info.major == 2:
            remote_branches = set([x.split('/')[-1] for x in remote_branches if 'remotes' in x])
        else:
            remote_branches = {x.split('/')[-1] for x in remote_branches if 'remotes' in x}

        data += [[x, False] for x in remote_branches]

        df = DataFrame(data, columns=['branch', 'local'])
        df['repository'] = self._repo_name()

        return df

    def tags(self):
        """
        Returns a data frame of all tags in origin.  The DataFrame will have the columns:

         * repository
         * tag

        :returns: DataFrame
        """

        tags = self.repo.tags
        df = DataFrame([x.name for x in list(tags)], columns=['tag'])
        df['repository'] = self._repo_name()

        return df

    @property
    def repo_name(self):
        return self._repo_name()

    def _repo_name(self):
        """
        Returns the name of the repository, using the local directory name.

        :returns: str
        """

        if self._git_repo_name is not None:
            return self._git_repo_name
        else:
            reponame = self.repo.git_dir.split(os.sep)[-2]
            if reponame.strip() == '':
                return 'unknown_repo'
            return reponame

    def __str__(self):
        """
        A pretty name for the repository object.

        :returns: str
        """
        return 'git repository: %s at: %s' % (self._repo_name(), self.git_dir,)

    def __repr__(self):
        """
        A unique name for the repository object.

        :returns: str
        """
        return str(self.git_dir)

    def bus_factor(self, by='repository', ignore_globs=None, include_globs=None):
        """
        An experimental heuristic for truck factor of a repository calculated by the current distribution of blame in
        the repository's primary branch.  The factor is the fewest number of contributors whose contributions make up at
        least 50% of the codebase's LOC

        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :param by: (optional, default=repository) whether to group by repository or by file
        :return:
        """

        if by == 'file':
            raise NotImplementedError('File-wise bus factor')

        blame = self.blame(include_globs=include_globs, ignore_globs=ignore_globs, by=by)
        blame = blame.sort_values(by=['loc'], ascending=False)

        total = blame['loc'].sum()
        cumulative = 0
        tc = 0
        for idx in range(blame.shape[0]):
            cumulative += blame.ix[idx, 'loc']
            tc += 1
            if cumulative >= total / 2:
                break

        return DataFrame([[self._repo_name(), tc]], columns=['repository', 'bus factor'])

    def file_owner(self, rev, filename, committer=True):
        """
        Returns the owner (by majority blame) of a given file in a given rev. Returns the committers' name.

        :param rev:
        :param filename:
        :param committer:
        """
        try:
            if committer:
                cm = 'committer'
            else:
                cm = 'author'

            blame = self.repo.blame(rev, os.path.join(self.git_dir, filename))
            blame = DataFrame([[x[0].committer.name, len(x[1])] for x in blame], columns=[cm, 'loc']).groupby(cm).agg(
                {'loc': np.sum})
            if blame.shape[0] > 0:
                return blame['loc'].idxmax()
            else:
                return None
        except (GitCommandError, KeyError):
            if self.verbose:
                print('Couldn\'t Calcualte File Owner for %s' % (rev,))
            return None

    def _file_last_edit(self, filename):
        """

        :param filename:
        :return:
        """

        tmp = self.repo.git.log('-n 1 -- %s' % (filename,)).split('\n')
        date_string = [x for x in tmp if x.startswith('Date:')]

        if len(date_string) > 0:
            return date_string[0].replace('Date:', '').strip()
        else:
            return None

    @multicache(
        key_prefix='file_detail',
        key_list=['include_globs', 'ignore_globs', 'rev', 'committer'],
        skip_if=lambda x: True if x.get('rev') is None or x.get('rev') == 'HEAD' else False
    )
    def file_detail(self, include_globs=None, ignore_globs=None, rev='HEAD', committer=True):
        """
        Returns a table of all current files in the repos, with some high level information about each file (total LOC,
        file owner, extension, most recent edit date, etc.).

        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :return:
        """

        # first get the blame
        blame = self.blame(
            include_globs=include_globs,
            ignore_globs=ignore_globs,
            rev=rev,
            committer=committer,
            by='file'
        )
        blame = blame.reset_index(level=1)
        blame = blame.reset_index(level=1)

        # reduce it to files and total LOC
        df = blame.reindex(columns=['file', 'loc'])
        df = df.groupby('file').agg({'loc': np.sum})
        df = df.reset_index(level=1)

        # map in file owners
        df['file_owner'] = df['file'].map(lambda x: self.file_owner(rev, x, committer=committer))

        # add extension (something like the language)
        df['ext'] = df['file'].map(lambda x: x.split('.')[-1])

        # add in last edit date for the file
        df['last_edit_date'] = df['file'].map(self._file_last_edit)
        df['last_edit_date'] = to_datetime(df['last_edit_date'])

        df = df.set_index('file')

        return df

    def punchcard(self, branch='master', limit=None, days=None, by=None, normalize=None, ignore_globs=None,
                  include_globs=None):
        """
        Returns a pandas DataFrame containing all of the data for a punchcard.

         * day_of_week
         * hour_of_day
         * author / committer
         * lines
         * insertions
         * deletions
         * net

        :param branch: the branch to return commits for
        :param limit: (optional, default=None) a maximum number of commits to return, None for no limit
        :param days: (optional, default=None) number of days to return, if limit is None
        :param by: (optional, default=None) agg by options, None for no aggregation (just a high level punchcard), or 'committer', 'author'
        :param normalize: (optional, default=None) if an integer, returns the data normalized to max value of that (for plotting)
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything.
        :return: DataFrame
        """

        ch = self.commit_history(
            branch=branch,
            limit=limit,
            days=days,
            ignore_globs=ignore_globs,
            include_globs=include_globs
        )

        # add in the date fields
        ch['day_of_week'] = ch.index.map(lambda x: x.weekday())
        ch['hour_of_day'] = ch.index.map(lambda x: x.hour)

        aggs = ['hour_of_day', 'day_of_week']
        if by is not None:
            aggs.append(by)

        punch_card = ch.groupby(aggs).agg({
            'lines': np.sum,
            'insertions': np.sum,
            'deletions': np.sum,
            'net': np.sum
        })
        punch_card.reset_index(inplace=True)

        # normalize all cols
        if normalize is not None:
            for col in ['lines', 'insertions', 'deletions', 'net']:
                punch_card[col] = (punch_card[col] / punch_card[col].sum()) * normalize

        return punch_card


class GitFlowRepository(Repository):
    """
    A special case where git flow is followed, so we know something about the branching scheme
    """

    def __init__(self):
        super(GitFlowRepository, self).__init__()