#!/usr/bin/python3
#################################################
# All functions that work on the git repository #
#################################################

import sqlite3
import os
from subprocess import check_output
import multiprocessing
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy.stats import entropy
import pydriller as pydriller
from pydriller.git_repository import GitCommandError
from Levenshtein import distance as lev_dist
import datetime
import pathpy as pp
import re
import lizard
import sys
import collections
#from contextlib import closing
from git2net import __version__

import time
import threading
# thread_local = threading.local()
git_init_lock = multiprocessing.Lock()

#import stopit

try:
    import thread
except ImportError:
    import _thread as thread


class TimeoutException(Exception):   # Custom exception class
    pass


class Alarm(threading.Thread):
    def __init__(self, timeout):
        threading.Thread.__init__(self)
        self.timeout = timeout
        self.setDaemon(True)

    def run(self):
        if self.timeout > 0:
            time.sleep(self.timeout)
            thread.interrupt_main()


import json
abs_path = os.path.dirname(__file__)
rel_path = 'helpers/binary-extensions/binary-extensions.json'
with open(os.path.join(abs_path, rel_path)) as json_file:
    binary_extensions = json.load(json_file)


def _get_block_length(lines, k):
    """ Calculates the length (in number of lines) of an edit of added/deleted lines starting in a
        given line k.

    Args:
        lines: dictionary of added or deleted lines
        k: line number to check for

    Returns:
        block_size: number of lines in the contiguous block that was modified
    """
    if k not in lines or (k > 1 and k - 1 in lines):
        edit = False
        block_size = 0
    else:
        edit = True
        block_size = 1

    while edit:
        if k + block_size in lines:
            block_size += 1
        else:
            edit = False
    return block_size


def _identify_edits(deleted_lines, added_lines, extraction_settings):
    """ Maps line numbers between the pre- and post-commit version of a modification.
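
    Example (illustrative values, with block-based extraction disabled):

        >>> pre_to_post, edits = _identify_edits({2: 'old'}, {2: 'new', 3: 'extra'},
        ...                                      {'use_blocks': False})
        >>> pre_to_post
        {2: 2, 3: 4}
        >>> list(edits['type'])
        ['replacement', 'addition']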
Args: deleted_lines: dictionary of deleted lines added_lines: dictionary of added lines extraction_settings: settings for the extraction Returns: pre_to_post: dictionary mapping line numbers before and after the commit edits: dataframe with information on edits """ # either deleted or added lines must contain items otherwise there would not be a modification # to process if len(deleted_lines) > 0: max_deleted = max(deleted_lines.keys()) min_deleted = min(deleted_lines.keys()) else: max_deleted = -1 min_deleted = np.inf if len(added_lines) > 0: max_added = max(added_lines.keys()) min_added = min(added_lines.keys()) else: max_added = -1 min_added = np.inf # create mapping between pre and post edit line numbers pre_to_post = {} # create DataFrame holding information on edit edits = [] # line numbers of lines before the first addition or deletion do not change pre = min(max(min_added, 0), max(min_deleted, 0)) post = min(max(min_added, 0), max(min_deleted, 0)) # counters used to match pre and post line number no_post_inc = 0 both_inc = 0 no_pre_inc = 0 # line numbers after the last addition or deletion do not matter for edits while (pre <= max_deleted + 1) or (post <= max_added + 1): if extraction_settings['use_blocks']: # compute size of added and deleted edits # size is reported as 0 if the line is not in added or deleted lines, respectively length_added_block = _get_block_length(added_lines, post) length_deleted_block = _get_block_length(deleted_lines, pre) # replacement if both deleted and added > 0 # if not both > 0, deletion if deleted > 0 # if not both > 0, addition if added > 0 if (length_deleted_block > 0) and (length_added_block > 0): edits.append({'pre_start': int(pre), 'number_of_deleted_lines': int(length_deleted_block), 'post_start': int(post), 'number_of_added_lines': int(length_added_block), 'type': 'replacement'}) # ignore_index=True, sort=False) elif length_deleted_block > 0: edits.append({'pre_start': int(pre), 'number_of_deleted_lines': int(length_deleted_block), 'post_start': int(post), 'number_of_added_lines': int(length_added_block), 'type': 'deletion'}) # ignore_index=True, sort=False) elif length_added_block > 0: edits.append({'pre_start': int(pre), 'number_of_deleted_lines': int(length_deleted_block), 'post_start': int(post), 'number_of_added_lines': int(length_added_block), 'type': 'addition'}) # ignore_index=True, sort=False) # deleted edit is larger than added edit if length_deleted_block > length_added_block: no_post_inc = length_deleted_block - length_added_block both_inc = length_added_block # added edit is larger than deleted edit elif length_added_block > length_deleted_block: no_pre_inc = length_added_block - length_deleted_block both_inc = length_deleted_block else: # no blocks are considered pre_in_deleted = pre in deleted_lines post_in_added = post in added_lines # cf. 
case of blocks above # length of blocks is equivalent to line being in added or deleted lines if pre_in_deleted and post_in_added: edits.append({'pre_start': int(pre), 'number_of_deleted_lines': int(pre_in_deleted), 'post_start': int(post), 'number_of_added_lines': int(post_in_added), 'type': 'replacement'}) # ignore_index=True, sort=False) elif pre_in_deleted and not post_in_added: edits.append({'pre_start': int(pre), 'number_of_deleted_lines': int(pre_in_deleted), 'post_start': None, 'number_of_added_lines': None, 'type': 'deletion'}) # ignore_index=True, sort=False) no_post_inc += 1 elif post_in_added and not pre_in_deleted: edits.append({'pre_start': None, 'number_of_deleted_lines': None, 'post_start': int(post), 'number_of_added_lines': int(post_in_added), 'type': 'addition'}) # ignore_index=True, sort=False) no_pre_inc += 1 # increment pre and post counter if both_inc > 0: both_inc -= 1 pre_to_post[pre] = post pre += 1 post += 1 elif no_post_inc > 0: no_post_inc -= 1 pre_to_post[pre] = False pre += 1 elif no_pre_inc > 0: no_pre_inc -= 1 post += 1 else: pre_to_post[pre] = post pre += 1 post += 1 edits = pd.DataFrame(edits) return pre_to_post, edits def text_entropy(text): """ Computes entropy for a given text based on UTF8 alphabet. Args: text: string to compute the text entropy for Returns: text_entropy: text entropy of the given string """ # we only consider UTF8 characters to compute the text entropy pk = [text.count(chr(i)) for i in range(256)] if sum(pk) == 0: text_entropy = None else: text_entropy = entropy(pk, base=2) return text_entropy def get_commit_dag(git_repo_dir): """ Extracts commit dag from given path to git repository. Args: git_repo_dir: path to the git repository that is mined Returns: dag: dag linking successive commits in the same branch """ git_repo = pydriller.GitRepository(git_repo_dir) commits = [x.hash[0:7] for x in git_repo.get_list_commits()] dag = pp.DAG() for node in commits: for parent in git_repo.get_commit(node).parents: dag.add_edge(parent[0:7], node) return dag def _parse_blame_C(blame_C): """ Converts the input provided for the copy option in git blame to a list of options required as input for gitpython. Args: blame_C: string defining how the copy option in git blame is used Returns: list_of_arguments: list of parameters for gitpython blame """ pattern = re.compile("(^$|^-?C{0,3}[0-9]*$)") if not pattern.match(blame_C): raise Exception("Invalid 'blame_C' supplied.") if len(blame_C) == 0: list_of_arguments = [] else: if blame_C[0] == '-': blame_C = blame_C[1:] cs = len(blame_C) - len(blame_C.lstrip('C')) num = blame_C.lstrip('C') list_of_arguments = ['-C' for i in range(cs - 1)] + ['-C{}'.format(num)] return list_of_arguments def _parse_porcelain_blame(blame): """ Parses the porcelain output of git blame and returns content as dataframe. Args: blame: porcelain output of git blame Returns: blame_info: content of blame as pandas dataframe """ l = {'original_commit_hash': [], 'original_line_no': [], 'original_file_path': [], 'line_content': [], 'line_number': []} start_of_line_info = True prefix = '\t' line_number = 1 filename = '' # Initialise filename variable. 
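    # Illustrative --line-porcelain record that the loop below parses (values are made up):
    #     9d5e4f3a...  12 17 1      <- original commit hash, original line no, final line no
    #     author Jane Doe           <- further header fields are skipped ...
    #     filename src/module.py    <- ... except 'filename', which updates the current file path
    #     \tactual line content     <- tab-prefixed content line closes the record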
    for idx, line in enumerate(blame.split('\n')):
        if line.startswith(prefix):
            l['original_file_path'].append(filename)
            l['line_content'].append(line[len(prefix):])
            l['line_number'].append(line_number)
            line_number += 1
            start_of_line_info = True
        else:
            entries = line.split(' ')
            if start_of_line_info:
                l['original_commit_hash'].append(entries[0])
                l['original_line_no'].append(entries[1])
                start_of_line_info = False
            elif entries[0] == 'filename':
                filename = entries[1]

    blame_info = pd.DataFrame(l)
    return blame_info


def _get_edit_details(edit, commit, deleted_lines, added_lines, blame_info_parent,
                      blame_info_commit, extraction_settings):
    """ Extracts detailed measures for a given edit.

    Args:
        edit: edit as identified in _identify_edits
        commit: pydriller commit object containing the edit
        deleted_lines: dict of deleted lines
        added_lines: dict of added lines
        blame_info_parent: blame info for parent commit as output from _parse_porcelain_blame
        blame_info_commit: blame info for current commit as output from _parse_porcelain_blame
        extraction_settings: settings for the extraction

    Returns:
        e: dict containing detailed information on the edit
    """
    # Different actions for different types of edits.
    e = {}
    if edit.type == 'replacement':
        # For replacements, both the content of the deleted and added block are required in
        # order to compute text entropy, as well as Levenshtein edit distance between them.
        deleted_block = []
        for i in range(int(edit.pre_start), int(edit.pre_start + edit.number_of_deleted_lines)):
            deleted_block.append(deleted_lines[i])

        added_block = []
        for i in range(int(edit.post_start), int(edit.post_start + edit.number_of_added_lines)):
            added_block.append(added_lines[i])

        # For the analysis, lines are concatenated with whitespaces.
        deleted_block = ' '.join(deleted_block)
        added_block = ' '.join(added_block)

        # Given this, all metadata can be written.
        # Data on the content and location of deleted line in the parent commit.
        e['pre_starting_line_no'] = int(edit.pre_start)
        e['pre_len_in_lines'] = int(edit.number_of_deleted_lines)
        e['pre_len_in_chars'] = len(deleted_block)
        e['pre_entropy'] = text_entropy(deleted_block)

        # Data on the content and location of added line in the current commit.
        e['post_starting_line_no'] = int(edit.post_start)
        e['post_len_in_lines'] = int(edit.number_of_added_lines)
        e['post_len_in_chars'] = len(added_block)
        e['post_entropy'] = text_entropy(added_block)

        # Levenshtein edit distance between deleted and added block.
        if extraction_settings['extract_text']:
            e['pre_text'] = deleted_block.encode('utf8','surrogateescape').decode('utf8','replace')
            e['post_text'] = added_block.encode('utf8','surrogateescape').decode('utf8','replace')
        e['levenshtein_dist'] = lev_dist(deleted_block, added_block)

        # Data on origin of deleted line. Every deleted line must have an origin.
        if extraction_settings['use_blocks']:
            e['original_commit_deletion'] = 'not available with use_blocks'
            e['original_line_no_deletion'] = 'not available with use_blocks'
            e['original_file_path_deletion'] = 'not available with use_blocks'
        else:
            assert blame_info_parent is not None
            e['original_commit_deletion'] = blame_info_parent.at[int(edit.pre_start) - 1,
                                                                 'original_commit_hash']
            e['original_line_no_deletion'] = blame_info_parent.at[int(edit.pre_start) - 1,
                                                                  'original_line_no']
            e['original_file_path_deletion'] = blame_info_parent.at[int(edit.pre_start) - 1,
                                                                    'original_file_path']

        # Data on the origin of added line. Can be either original or copied from another file.
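        # Note: edit line numbers are 1-based, while the blame DataFrames built by
        # _parse_porcelain_blame use pandas' default 0-based index, hence the `- 1` offsets
        # in the blame lookups above and below.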
if extraction_settings['use_blocks']: e['original_commit_addition'] = 'not available with use_blocks' e['original_line_no_addition'] = 'not available with use_blocks' e['original_file_path_addition'] = 'not available with use_blocks' elif blame_info_commit.at[int(edit.post_start) - 1, 'original_commit_hash'] == commit.hash: # The line is original, there exists no original commit, line number or file path. e['original_commit_addition'] = None e['original_line_no_addition'] = None e['original_file_path_addition'] = None else: # The line was copied from somewhere. assert blame_info_commit is not None e['original_commit_addition'] = blame_info_commit.at[int(edit.post_start) - 1, 'original_commit_hash'] e['original_line_no_addition'] = blame_info_commit.at[int(edit.post_start) - 1, 'original_line_no'] e['original_file_path_addition'] = blame_info_commit.at[int(edit.post_start) - 1, 'original_file_path'] elif edit.type == 'deletion': # For deletions, only the content of the deleted block is required. deleted_block = [] for i in range(int(edit.pre_start), int(edit.pre_start + edit.number_of_deleted_lines)): deleted_block.append(deleted_lines[i]) deleted_block = ' '.join(deleted_block) # Given this, all metadata can be written. # Data on the deleted line in the parent commit. e['pre_starting_line_no'] = int(edit.pre_start) e['pre_len_in_lines'] = int(edit.number_of_deleted_lines) e['pre_len_in_chars'] = len(deleted_block) e['pre_entropy'] = text_entropy(deleted_block) # For deletions, there is no added line. e['post_starting_line_no'] = None e['post_len_in_lines'] = None e['post_len_in_chars'] = None e['post_entropy'] = None e['original_commit_addition'] = None e['original_line_no_addition'] = None e['original_file_path_addition'] = None # Levenshtein edit distance is set to 'None'. Theoretically 1 keystroke required. if extraction_settings['extract_text']: e['pre_text'] = deleted_block.encode('utf8','surrogateescape').decode('utf8','replace') e['post_text'] = None e['levenshtein_dist'] = None # Data on origin of deleted line. Every deleted line must have an origin. if extraction_settings['use_blocks']: e['original_commit_deletion'] = 'not available with use_blocks' e['original_line_no_deletion'] = 'not available with use_blocks' e['original_file_path_deletion'] = 'not available with use_blocks' else: assert blame_info_parent is not None e['original_commit_deletion'] = blame_info_parent.at[int(edit.pre_start) - 1, 'original_commit_hash'] e['original_line_no_deletion'] = blame_info_parent.at[int(edit.pre_start) - 1, 'original_line_no'] e['original_file_path_deletion'] = blame_info_parent.at[int(edit.pre_start) - 1, 'original_file_path'] elif edit.type == 'addition': # For additions, only the content of the added block is required. added_block = [] for i in range(int(edit.post_start), int(edit.post_start + edit.number_of_added_lines)): added_block.append(added_lines[i]) added_block = ' '.join(added_block) # Given this, all metadata can be written. # For additions, there is no deleted line. e['pre_starting_line_no'] = None e['pre_len_in_lines'] = None e['pre_len_in_chars'] = None e['pre_entropy'] = None e['original_commit_deletion'] = None e['original_line_no_deletion'] = None e['original_file_path_deletion'] = None # Data on the added line. 
e['post_starting_line_no'] = int(edit.post_start) e['post_len_in_lines'] = int(edit.number_of_added_lines) e['post_len_in_chars'] = len(added_block) e['post_entropy'] = text_entropy(added_block) # Levenshtein edit distance is length of added block as nothing existed before. if extraction_settings['extract_text']: e['pre_text'] = None e['post_text'] = added_block.encode('utf8','surrogateescape').decode('utf8','replace') e['levenshtein_dist'] = len(added_block) # If the lines were newly added to this file, they might still come from another file. if extraction_settings['use_blocks']: e['original_commit_addition'] = 'not available with use_blocks' e['original_line_no_addition'] = 'not available with use_blocks' e['original_file_path_addition'] = 'not available with use_blocks' elif blame_info_commit.at[int(edit.post_start) - 1, 'original_commit_hash'] == commit.hash: # The line is original, there exists no original commit, line number or file path. e['original_commit_addition'] = None e['original_line_no_addition'] = None e['original_file_path_addition'] = None else: # The line was copied from somewhere. assert blame_info_commit is not None e['original_commit_addition'] = blame_info_commit.at[int(edit.post_start) - 1, 'original_commit_hash'] e['original_line_no_addition'] = blame_info_commit.at[int(edit.post_start) - 1, 'original_line_no'] e['original_file_path_addition'] = blame_info_commit.at[int(edit.post_start) - 1, 'original_file_path'] elif (edit.type == 'file_renaming') or (edit.type == 'binary_file_change'): # For file renaming only old and new path are required which were already set before. e['pre_starting_line_no'] = None e['pre_len_in_lines'] = None e['pre_len_in_chars'] = None e['pre_entropy'] = None e['post_starting_line_no'] = None e['post_len_in_lines'] = None e['post_len_in_chars'] = None e['post_entropy'] = None if extraction_settings['use_blocks']: e['original_commit_deletion'] = 'not available with use_blocks' e['original_line_no_deletion'] = 'not available with use_blocks' e['original_file_path_deletion'] = 'not available with use_blocks' e['original_commit_addition'] = 'not available with use_blocks' e['original_line_no_addition'] = 'not available with use_blocks' e['original_file_path_addition'] = 'not available with use_blocks' else: e['original_commit_deletion'] = None e['original_line_no_deletion'] = None e['original_file_path_deletion'] = None e['original_commit_addition'] = None e['original_line_no_addition'] = None e['original_file_path_addition'] = None # Levenshtein edit distance set to 0 to distinguish from deletion if extraction_settings['extract_text']: e['pre_text'] = None e['post_text'] = None e['levenshtein_dist'] = 0 else: print(edit.type) raise Exception("Unexpected error in '_get_edit_details'.") return e def is_binary_file(filename, file_content): if filename is None: return False else: try: extension = re.search(r'.*\.([^\.]+)$', filename).groups()[0] except AttributeError: extension = None if extension in binary_extensions: return True else: try: file_content.encode('utf-8', errors='strict') except UnicodeEncodeError: return True else: return False def _extract_edits(git_repo, commit, modification, extraction_settings): """ Returns dataframe with metadata on edits made in a given modification. 
Args: git_repo: pydriller GitRepository object commit: pydriller Commit object modification: pydriller Modification object extraction_settings: settings for the extraction Returns: edits_info: pandas DataFrame object containing metadata on all edits in given modification """ binary_file = is_binary_file(modification.filename, modification.diff) found_paths = False if not binary_file: try: old_path, new_path = re.search(r'Binary files a?\/(.*) and b?\/(.*) differ', modification.diff.strip()).groups() if old_path == 'dev/null': old_path = None if new_path == 'dev/null': new_path = None found_paths = True binary_file = True except AttributeError: pass if binary_file: if found_paths: edits = pd.DataFrame({'pre_start': None, 'number_of_deleted_lines': None, 'post_start': None, 'number_of_added_lines': None, 'type': 'binary_file_change', 'new_path': new_path, 'old_path': old_path}, index=[0]) else: edits = pd.DataFrame({'pre_start': None, 'number_of_deleted_lines': None, 'post_start': None, 'number_of_added_lines': None, 'type': 'binary_file_change', 'new_path': modification.new_path, 'old_path': modification.old_path}, index=[0]) deleted_lines = {} added_lines = {} else: # Parse diff of given modification to extract added and deleted lines parsed_lines = modification.diff_parsed deleted_lines = { x[0]:x[1] for x in parsed_lines['deleted'] } added_lines = { x[0]:x[1] for x in parsed_lines['added'] } # If there was a modification but no lines were added or removed, the file was renamed. if (len(deleted_lines) == 0) and (len(added_lines) == 0): edits = pd.DataFrame({'pre_start': None, 'number_of_deleted_lines': None, 'post_start': None, 'number_of_added_lines': None, 'type': 'file_renaming'}, index=[0]) else: # If there were lines added or deleted, the specific edits are identified. _, edits = _identify_edits(deleted_lines, added_lines, extraction_settings) # In order to trace the origins of lines e execute git blame is executed. For lines that were # deleted with the current commit, the blame needs to be executed on the parent commit. As # merges are treated separately, commits should only have one parent. For added lines, git blame # is executed on the current commit. blame_info_parent = None blame_info_commit = None try: if not binary_file: if len(deleted_lines) > 0: assert len(commit.parents) == 1 blame_parent = git_repo.git.blame(commit.parents[0], _parse_blame_C(extraction_settings['blame_C']) + ['--show-number', '--line-porcelain'], modification.old_path) blame_info_parent = _parse_porcelain_blame(blame_parent) if len(added_lines) > 0: blame_commit = git_repo.git.blame(commit.hash, _parse_blame_C(extraction_settings['blame_C']) + ['--show-number', '--line-porcelain'], modification.new_path) blame_info_commit = _parse_porcelain_blame(blame_commit) except GitCommandError: return pd.DataFrame() else: # Next, metadata on all identified edits is extracted and added to a pandas DataFrame. l = [] for _, edit in edits.iterrows(): e = {} # Extract general information. 
if edit.type == 'binary_file_change': e['new_path'] = edit.new_path e['old_path'] = edit.old_path if extraction_settings['extract_complexity']: e['cyclomatic_complexity_of_file'] = None e['lines_of_code_in_file'] = None e['total_added_lines'] = None e['total_removed_lines'] = None else: e['new_path'] = modification.new_path e['old_path'] = modification.old_path if extraction_settings['extract_complexity']: e['cyclomatic_complexity_of_file'] = modification.complexity e['lines_of_code_in_file'] = modification.nloc e['total_added_lines'] = modification.added e['total_removed_lines'] = modification.removed e['filename'] = modification.filename e['commit_hash'] = commit.hash e['modification_type'] = modification.change_type.name e['edit_type'] = edit.type e.update(_get_edit_details(edit, commit, deleted_lines, added_lines, blame_info_parent, blame_info_commit, extraction_settings)) l.append(e) edits_info = pd.DataFrame(l) return edits_info def _extract_edits_merge(git_repo, commit, modification_info, extraction_settings): """ Returns dataframe with metadata on edits made in a given modification for merge commits. Args: git_repo: pydriller GitRepository object commit: pydriller Commit object modification_info: information on the modification as stored in a pydriller Modification. extraction_settings: settings for the extraction Returns: edits_info: pandas DataFrame object containing metadata on all edits in given modification """ assert commit.merge # With merges, the following cases can occur: # 1. Changes of one or more parents are accepted. # 2. Changes made prior to the merge are replaced with new edits. # To obtain the state of the file before merging, get blame is executed on all parent commits. try: file_content = git_repo.git.show('{}:{}'.format(commit.hash, modification_info['new_path'])) except GitCommandError: file_content = '' file_content_parents = [] for parent in commit.parents: try: file_content_parents.append(git_repo.git.show('{}:{}'.format(parent, modification_info['old_path']))) except GitCommandError: file_content_parents.append('') binary_file = is_binary_file(modification_info['new_path'], file_content) if not binary_file: for file_content_parent in file_content_parents: if is_binary_file(modification_info['new_path'], file_content_parent): binary_file = True break if binary_file: blame_info_parent = None blame_info_commit = None added_lines = [] deleted_lines = [] edits = pd.DataFrame({'pre_start': None, 'number_of_deleted_lines': None, 'post_start': None, 'number_of_added_lines': None, 'type': 'binary_file_change'}, index=[0]) edits_info = pd.DataFrame() for _, edit in edits.iterrows(): e = {} e['commit_hash'] = commit.hash e['edit_type'] = edit.type e['total_added_lines'] = None e['total_removed_lines'] = None e.update(modification_info) e.update(_get_edit_details(edit, commit, deleted_lines, added_lines, blame_info_parent, blame_info_commit, extraction_settings)) edits_info = edits_info.append(e, ignore_index=True, sort=False) return edits_info else: parent_blames = [] for parent in commit.parents: try: parent_blame = git_repo.git.blame(parent, _parse_blame_C(extraction_settings['blame_C']) + ['--show-number', '--line-porcelain'], modification_info['old_path']) if len(parent_blame) > 0: parent_blame = _parse_porcelain_blame(parent_blame).rename( columns={'line_content': 'pre_line_content', 'line_number': 'pre_line_number'}) parent_blame.loc[:, 'pre_commit'] = parent else: parent_blame = pd.DataFrame({'original_commit_hash': [], 'original_line_no': [], 
'original_file_path': [], 'pre_line_content': [], 'pre_line_number': [], 'pre_commit': []}) except GitCommandError: parent_blame = pd.DataFrame({'original_commit_hash': [], 'original_line_no': [], 'original_file_path': [], 'pre_line_content': [], 'pre_line_number': [], 'pre_commit': []}) parent_blames.append(parent_blame) # Then, the current state of the file is obtained by executing git blame on the current commit. try: current_blame = git_repo.git.blame(commit.hash, _parse_blame_C(extraction_settings['blame_C']) + ['--show-number', '--line-porcelain'], modification_info['new_path']) if len(current_blame) > 0: current_blame = _parse_porcelain_blame(current_blame).rename( columns={'line_content': 'post_line_content', 'line_number': 'post_line_number'}) else: current_blame = pd.DataFrame({'original_commit_hash': [], 'original_line_no': [], 'original_file_path': [], 'post_line_content': [], 'post_line_number': []}) except GitCommandError: current_blame = pd.DataFrame({'original_commit_hash': [], 'original_line_no': [], 'original_file_path': [], 'post_line_content': [], 'post_line_number': []}) # Define columns that are considered when identifying duplicates. comp_cols = ['original_commit_hash', 'original_line_no', 'original_file_path'] for idx, parent_blame in enumerate(parent_blames): parent_blames[idx]['_count'] = parent_blame.groupby(comp_cols).cumcount() current_blame['_count'] = current_blame.groupby(comp_cols).cumcount() deletions = [] additions = [] for parent_blame in parent_blames: comp = parent_blame.merge(current_blame, on=comp_cols+['_count'], how='outer', indicator=True) comp['_action'] = np.nan comp.loc[comp['_merge']=='both', '_action'] = 'accepted' comp.loc[comp['_merge']=='right_only', '_action'] = 'added' comp.loc[comp['_merge']=='left_only', '_action'] = 'deleted' assert comp['_action'].isnull().any() == False drop_cols = ['_count', '_merge', '_action'] added = comp.loc[comp['_action']=='added'].drop(drop_cols, axis=1) deleted = comp.loc[comp['_action']=='deleted'].drop(drop_cols, axis=1) additions.append(added) deletions.append(deleted) added_lines_counter = collections.Counter() for added in additions: for _, x in added.iterrows(): added_lines_counter[(x.post_line_number, x.post_line_content)] += 1 added_lines = {k[0]: k[1] for k, v in added_lines_counter.items() if v == len(commit.parents)} deleted_lines_parents = [] for deleted in deletions: deleted_lines_parents.append({x.pre_line_number: x.pre_line_content for _, x in deleted.iterrows()}) matches = [] for k, v in added_lines.items(): for idx, deleted_lines in enumerate(deleted_lines_parents): if k in deleted_lines and v == deleted_lines[k]: del deleted_lines_parents[idx][k] matches.append(k) for k in set(matches): del added_lines[k] edits_info = [] edits_parents = [] for deleted_lines in deleted_lines_parents: _, edits = _identify_edits(deleted_lines, added_lines, extraction_settings) edits_parents.append(edits) for idx, edits in enumerate(edits_parents): for _, edit in edits.iterrows(): # extract edit details for all edits if merge deletions are extracted # or the edit type is not a deletion if extraction_settings['extract_merge_deletions'] or (edit.type != 'deletion'): e = {} # Extract general information. 
e['commit_hash'] = commit.hash e['edit_type'] = edit.type e.update(modification_info) e.update(_get_edit_details(edit, commit, deleted_lines_parents[idx], added_lines, parent_blames[idx], current_blame, extraction_settings)) edits_info.append(e) return pd.DataFrame(edits_info) def _get_edited_file_paths_since_split(git_repo, commit): """ For a merge commit returns list of all files edited since the last creation of a new branch relevant for the merge. Args: git_repo: pydriller GitRepository object commit: pydriller Commit object Returns: edited_file_paths: list of paths to the edited files """ def expand_dag(dag, leafs): """ Expands a dag by adding the parents of a given set of nodes to the dag. Args: dag: pathpy DAG object leafs: set of nodes that are expanded Returns: dag: the expanded pathpy DAG object """ for node in leafs: parents = git_repo.get_commit(node).parents for parent in parents: dag.add_edge(node, parent) return dag def common_node_on_paths(paths): """ Computes the overlap between given sets of nodes. Returns the nodes present in all sets. Args: paths: list of node sequences Returns: common_nodes: set of nodes that are present on all paths """ # Drop first and last element of the path. common_nodes = set(paths[0][1:-1]) for path in paths[1:]: common_nodes.intersection_update(path[1:-1]) common_nodes = list(common_nodes) return common_nodes def remove_successors(dag, node): """ Removes all successors of a node from a given dag. Args: dag: pathpy DAG object node: node for which successors shall be removed Returns: dag: reduced pathpy DAG object """ rm = [n for nl in [x[1:] for x in dag.routes_from_node(node)] for n in nl] for node in rm: dag.remove_node(node) return dag dag = pp.DAG() dag.add_node(commit.hash) leafs = list(dag.nodes) cont = True while cont: dag = expand_dag(dag, leafs) leafs = [node for node in dag.nodes if len(dag.successors[node]) == 0] paths = [p for pl in [dag.routes_to_node(node) for node in leafs] for p in pl] common_nodes = common_node_on_paths(paths) if (len(leafs) == 1) or (len(common_nodes) > 0): cont = False for node in common_nodes: dag = remove_successors(dag, node) edited_file_paths = [] for node in dag.nodes: edited_file_paths += [modification.new_path for modification in git_repo.get_commit(node).modifications] edited_file_paths += [modification.old_path for modification in git_repo.get_commit(node).modifications] edited_file_paths = set(edited_file_paths) if None in edited_file_paths: edited_file_paths.remove(None) return edited_file_paths def _process_commit(args): """ Extracts information on commit and all edits made with the commit. Args: args: dictionary with arguments. For multiprocessing, function can only take single input. 
Dictionary must contain: git_repo_dir: path to the git repository that is mined commit_hash: hash of the commit that is processed extraction_settings: settings for the extraction Returns: extracted_result: dict containing two dataframes with information of commit and edits """ with git_init_lock: git_repo = pydriller.GitRepository(args['git_repo_dir']) commit = git_repo.get_commit(args['commit_hash']) alarm = Alarm(args['extraction_settings']['timeout']) alarm.start() try: # parse commit c = {} c['hash'] = commit.hash c['author_email'] = commit.author.email c['author_name'] = commit.author.name c['committer_email'] = commit.committer.email c['committer_name'] = commit.committer.name c['author_date'] = commit.author_date.strftime('%Y-%m-%d %H:%M:%S') c['committer_date'] = commit.committer_date.strftime('%Y-%m-%d %H:%M:%S') c['author_timezone'] = commit.author_timezone c['committer_timezone'] = commit.committer_timezone c['no_of_modifications'] = len(commit.modifications) c['commit_message_len'] = len(commit.msg) if args['extraction_settings']['extract_text']: c['commit_message'] = commit.msg.encode('utf8','surrogateescape').decode('utf8','replace') c['project_name'] = commit.project_name c['parents'] = ','.join(commit.parents) c['merge'] = commit.merge c['in_main_branch'] = commit.in_main_branch c['branches'] = ','.join(commit.branches) # parse modification df_edits = pd.DataFrame() if commit.merge and args['extraction_settings']['extract_merges']: # Git does not create a modification if own changes are accpeted during a merge. # Therefore, the edited files are extracted manually. edited_file_paths = {f for p in commit.parents for f in git_repo.git.diff(commit.hash, p, '--name-only').split('\n')} if (args['extraction_settings']['max_modifications'] > 0) and \ (len(edited_file_paths) > args['extraction_settings']['max_modifications']): print('Commit exceeding max_modifications: ', commit.hash) extracted_result = {'commit': pd.DataFrame(), 'edits': pd.DataFrame()} return extracted_result for edited_file_path in edited_file_paths: exclude_file = False for x in args['extraction_settings']['exclude']: if edited_file_path.startswith(x + os.sep) or (edited_file_path == x): exclude_file = True if not exclude_file: modification_info = {} try: file_content = git_repo.git.show('{}:{}'.format(commit.hash, edited_file_path)) if is_binary_file(edited_file_path, file_content): if args['extraction_settings']['extract_complexity']: modification_info['cyclomatic_complexity_of_file'] = None modification_info['lines_of_code_in_file'] = None else: if args['extraction_settings']['extract_complexity']: l = lizard.analyze_file.analyze_source_code(edited_file_path, file_content) modification_info['cyclomatic_complexity_of_file'] = l.CCN modification_info['lines_of_code_in_file'] = l.nloc modification_info['filename'] = edited_file_path.split(os.sep)[-1] modification_info['new_path'] = edited_file_path modification_info['old_path'] = edited_file_path modification_info['modification_type'] = 'merge_self_accept' df_edits = df_edits.append(_extract_edits_merge(git_repo, commit, modification_info, args['extraction_settings']), ignore_index=True, sort=True) except GitCommandError: # A GitCommandError occurs if the file was deleted. In this case it # currently has no content. # Get filenames from all modifications in merge commit. paths = [m.old_path for m in commit.modifications] # Analyse changes if modification was recorded. Else, the deletions were # made before the merge. 
if edited_file_path in paths: modification_info['filename'] = edited_file_path.split(os.sep)[-1] modification_info['new_path'] = None # File was deleted. modification_info['old_path'] = edited_file_path if args['extraction_settings']['extract_complexity']: modification_info['cyclomatic_complexity_of_file'] = 0 modification_info['lines_of_code_in_file'] = 0 modification_info['modification_type'] = 'merge_self_accept' df_edits = df_edits.append(_extract_edits_merge(git_repo, commit, modification_info, args['extraction_settings']), ignore_index=True, sort=True) else: if (args['extraction_settings']['max_modifications'] > 0) and \ (len(commit.modifications) > args['extraction_settings']['max_modifications']): print('Commit exceeding max_modifications: ', commit.hash) extracted_result = {'commit': pd.DataFrame(), 'edits': pd.DataFrame()} return extracted_result for modification in commit.modifications: exclude_file = False for x in args['extraction_settings']['exclude']: if modification.new_path: if modification.new_path.startswith(x + os.sep) or \ (modification.new_path == x): exclude_file = True if not exclude_file and modification.old_path: if modification.old_path.startswith(x + os.sep): exclude_file = True if not exclude_file: df_edits = df_edits.append(_extract_edits(git_repo, commit, modification, args['extraction_settings']), ignore_index=True, sort=True) df_commit = pd.DataFrame(c, index=[0]) extracted_result = {'commit': df_commit, 'edits': df_edits} except KeyboardInterrupt: print('Timeout processing commit: ', commit.hash) extracted_result = {'commit': pd.DataFrame(), 'edits': pd.DataFrame()} del alarm return extracted_result def _process_repo_serial(git_repo_dir, sqlite_db_file, commits, extraction_settings): """ Processes all commits in a given git repository in a serial manner. Args: git_repo_dir: path to the git repository that is mined sqlite_db_file: path (including database name) where the sqlite database will be created commits: list of commits that have to be processed extraction_settings: settings for the extraction Returns: sqlite database will be written at specified location """ git_repo = pydriller.GitRepository(git_repo_dir) con = sqlite3.connect(sqlite_db_file) for commit in tqdm(commits, desc='Serial'): args = {'git_repo_dir': git_repo_dir, 'commit_hash': commit.hash, 'extraction_settings': extraction_settings} result = _process_commit(args) if not result['edits'].empty: result['edits'].to_sql('edits', con, if_exists='append', index=False) if not result['commit'].empty: result['commit'].to_sql('commits', con, if_exists='append', index=False) def _process_repo_parallel(git_repo_dir, sqlite_db_file, commits, extraction_settings): """ Processes all commits in a given git repository in a parallel manner. 
Args: git_repo_dir: path to the git repository that is mined sqlite_db_file: path (including database name) where the sqlite database will be created commits: list of commits that are already in the database extraction_settings: settings for the extraction Returns: sqlite database will be written at specified location """ args = [{'git_repo_dir': git_repo_dir, 'commit_hash': commit.hash, 'extraction_settings': extraction_settings} for commit in commits] # suggestion by marco-c (github.com/ishepard/pydriller/issues/110) def _init(git_repo_dir, git_init_lock_): global git_init_lock git_init_lock = git_init_lock_ con = sqlite3.connect(sqlite_db_file) with multiprocessing.Pool(extraction_settings['no_of_processes'], initializer=_init, initargs=(git_repo_dir,git_init_lock)) as p: with tqdm(total=len(args), desc='Parallel ({0} processes)' \ .format(extraction_settings['no_of_processes'])) as pbar: for result in p.imap_unordered(_process_commit, args, chunksize=extraction_settings['chunksize']): if not result['edits'].empty: result['edits'].to_sql('edits', con, if_exists='append', index=False) if not result['commit'].empty: result['commit'].to_sql('commits', con, if_exists='append', index=False) pbar.update(1) def identify_file_renaming(git_repo_dir): """ Identifies all names and locations different files in a repository have had. Args: git_repo_dir: path to the git repository that is mined Returns: dag: pathpy DAG object depicting the renaming process aliases: dictionary containing all aliases for all files """ # TODO: Consider corner case where file is renamed and new file with old name is created. git_repo = pydriller.GitRepository(git_repo_dir) dag = pp.DAG() for commit in tqdm(list(git_repo.get_list_commits()), desc='Creating DAG'): for modification in commit.modifications: if (modification.new_path not in dag.nodes) and \ (modification.old_path == modification.new_path) and \ (modification.change_type == pydriller.domain.commit.ModificationType.ADD): if modification.new_path not in dag.nodes: dag.add_node(modification.new_path) elif modification.old_path != modification.new_path: if pd.isnull(modification.old_path): if modification.new_path not in dag.nodes: dag.add_node(modification.new_path) elif pd.isnull(modification.new_path): pass else: dag.add_edge(modification.new_path, modification.old_path) dag.make_acyclic() nodes = [k for k, v in dag.nodes.items() if v['indegree'] == 0 and not v['outdegree'] == 0] aliases = {z: y[-1] for x in nodes for y in dag.routes_from_node(x) for z in y[:-1]} return dag, aliases def get_unified_changes(git_repo_dir, commit_hash, file_path): """ Returns dataframe with github-like unified diff representation of the content of a file before and after a commit for a given git repository, commit hash and file path. Args: git_repo_dir: path to the git repository that is mined commit_hash: commit hash for which the changes are computed file_path: path to file (within the repository) for which the changes are computed Returns: df: pandas dataframe listing changes made to file in commit """ git_repo = pydriller.GitRepository(git_repo_dir) commit = git_repo.get_commit(commit_hash) # Select the correct modifictaion. for modification in commit.modifications: if modification.new_path == file_path: break # Parse the diff extracting the lines added and deleted with the given commit. 
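    # pydriller's diff_parsed has the form {'added': [(line_no, content), ...],
    # 'deleted': [(line_no, content), ...]}; it is converted to {line_no: content} dicts below.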
parsed_lines = modification.diff_parsed deleted_lines = { x[0]:x[1] for x in parsed_lines['deleted'] } added_lines = { x[0]:x[1] for x in parsed_lines['added'] } # Indetify the edits made with the changes. pre_to_post, edits = _identify_edits(deleted_lines, added_lines, {'use_blocks': False}) # Extract the source code after the commit. post_source_code = modification.source_code.split('\n') # Initialise lists for output. pre_line_no = [] post_line_no = [] action = [] code = [] # Go through all lines and report on the changes. pre_counter = 1 post_counter = 1 while post_counter < len(post_source_code) or \ pre_counter < max(deleted_lines.keys()) or \ post_counter < max(added_lines.keys()): if pre_counter in list(edits.pre_start): pre_line_no.append(pre_counter) post_line_no.append(None) action.append('-') code.append(deleted_lines[pre_counter]) pre_counter += 1 elif post_counter in list(edits.post_start): pre_line_no.append(None) post_line_no.append(post_counter) action.append('+') code.append(added_lines[post_counter]) post_counter += 1 else: pre_line_no.append(pre_counter) post_line_no.append(post_counter) action.append(None) code.append(post_source_code[post_counter - 1]) pre_counter += 1 post_counter += 1 df = pd.DataFrame({'pre': pre_line_no, 'post': post_line_no, 'action': action, 'code': code}) return df def mining_state_summary(git_repo_dir, sqlite_db_file): """ Prints mining progress of database and returns dataframe with details on missing commits. Args: git_repo_dir: path to the git repository that is mined Returns: dataframe with details on missing commits """ git_repo = pydriller.GitRepository(git_repo_dir) if os.path.exists(sqlite_db_file): try: with sqlite3.connect(sqlite_db_file) as con: try: p_commits = set(x[0] for x in con.execute("SELECT hash FROM commits").fetchall()) except sqlite3.OperationalError: p_commits = set() except sqlite3.OperationalError: raise Exception("The provided file is not a compatible database.") else: raise Exception("Found no database at provided path.") commits = [c for c in git_repo.get_list_commits()] if not p_commits.issubset({c.hash for c in commits}): raise Exception("The database does not match the provided repository.") no_of_commits = len({c.hash for c in commits}) print('{} / {} ({:.2f}%) of commits were successfully mined.'.format( len(p_commits), no_of_commits, len(p_commits) / no_of_commits * 100)) u_commits = [c for c in commits if c.hash not in p_commits] u_commit_info = {'hash': [], 'is_merge': [], 'modifications': [], 'author_name': [], 'author_email': [], 'author_date': []} for c in tqdm(u_commits): u_commit_info['hash'].append(c.hash) try: u_commit_info['is_merge'].append(c.merge) except: print('Error reading "merge" for', c.hash) u_commit_info['is_merge'].append(None) if c.merge: u_commit_info['modifications'].append(len({f for p in c.parents for f in git_repo.git.diff(c.hash, p, '--name-only').split('\n')})) #print(c.modifications) else: u_commit_info['modifications'].append(len(c.modifications)) try: u_commit_info['author_name'].append(c.author.name) except: print('Error reading "author.name" for', c.hash) u_commit_info['author_name'].append(None) try: u_commit_info['author_email'].append(c.author.email) except: print('Error reading "author.email" for', c.hash) u_commit_info['author_email'].append(None) try: u_commit_info['author_date'].append(c.author_date.strftime('%Y-%m-%d %H:%M:%S')) except: print('Error reading "author_date" for', c.hash) u_commit_info['author_date'].append(None) u_commits_info = 
pd.DataFrame(u_commit_info) return u_commits_info def mine_git_repo(git_repo_dir, sqlite_db_file, commits=[], use_blocks=False, no_of_processes=os.cpu_count(), chunksize=1, exclude=[], blame_C='', max_modifications=0, timeout=0, extract_text=False, extract_complexity=False, extract_merges=True, extract_merge_deletions=False): """ Creates sqlite database with details on commits and edits for a given git repository. Args: git_repo_dir: path to the git repository that is mined sqlite_db_file: path (including database name) where the sqlite database will be created commits: only consider specific set of commits, considers all if empty use_blocks: bool, determins if analysis is performed on block or line basis no_of_processes: number of parallel processes that are spawned chunksize: number of tasks that are assigned to a process at a time exclude: file paths that are excluded from the analysis blame_C: string for the blame C option following the pattern "-C[<num>]" (computationally expensive) max_modifications: ignore commit if there are more modifications timeout: stop processing commit after given time in seconds extract_text: extract the commit message and line texts extract_complexity: extract cyclomatic complexity and length of file (computationally expensive) extract_merges: process merges extract_merge_deletions: extract lines that are not accepted during a merge as 'deletions' Returns: sqlite database will be written at specified location """ git_version = check_output(['git', '--version']).strip().split()[-1].decode("utf-8") if int(re.search(r'(\d+)(?:\.\d+[a-z]*)+', git_version).groups()[0]) < 2: raise Exception("Your system is using git " + git_version + " which is not supported by " + "git2net. Please update to git >= 2.0.") extraction_settings = {'use_blocks': use_blocks, 'no_of_processes': no_of_processes, 'chunksize': chunksize, 'exclude': exclude, 'blame_C': blame_C, 'max_modifications': max_modifications, 'timeout': timeout, 'extract_text': extract_text, 'extract_complexity': extract_complexity, 'extract_merges': extract_merges, 'extract_merge_deletions': extract_merge_deletions} git_repo = pydriller.GitRepository(git_repo_dir) if os.path.exists(sqlite_db_file): try: with sqlite3.connect(sqlite_db_file) as con: prev_method, prev_repository, prev_extract_text = con.execute( """SELECT method, repository, extract_text FROM _metadata""").fetchall()[0] if (prev_method == 'blocks' if use_blocks else 'lines') and \ (prev_repository == git_repo_dir) and \ (prev_extract_text == str(extract_text)): try: p_commits = set(x[0] for x in con.execute("SELECT hash FROM commits").fetchall()) except sqlite3.OperationalError: p_commits = set() c_commits = set(c.hash for c in pydriller.GitRepository(git_repo_dir).get_list_commits()) if not p_commits.issubset(c_commits): raise Exception("Found a database that was created with identical " + "settings. However, some commits in the database are not " + "in the provided git repository. Please provide a clean " + "database.") else: if p_commits == c_commits: print("The provided database is already complete!") return else: print("Found a matching database on provided path. " + "Skipping {} ({:.2f}%) of {} commits. {} commits remaining." .format(len(p_commits), len(p_commits) / len(c_commits) * 100, len(c_commits), len(c_commits) - len(p_commits))) else: raise Exception("Found a database on provided path that was created with " + "settings not matching the ones selected for the current " + "run. 
A path to either no database or a database from a " + "previously paused run with identical settings is required.") except sqlite3.OperationalError: raise Exception("Found a database on provided path that was likely not created with " + "git2net. A path to either no database or a database from a " + "previously paused run with identical settings is required.") else: print("Found no database on provided path. Starting from scratch.") with sqlite3.connect(sqlite_db_file) as con: con.execute("""CREATE TABLE _metadata ('created with', 'repository', 'date', 'method', 'extract_text')""") con.execute("""INSERT INTO _metadata ('created with', 'repository', 'date', 'method', 'extract_text') VALUES (:version, :repository, :date, :method, :extract_text)""", {'version': 'git2net ' + str(__version__), 'repository': git_repo_dir, 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'method': 'blocks' if use_blocks else 'lines', 'extract_text': str(extract_text)}) con.commit() p_commits = [] if not commits: u_commits = [c for c in git_repo.get_list_commits() if c.hash not in p_commits] else: c_commits = set(c.hash for c in pydriller.GitRepository(git_repo_dir).get_list_commits()) if not set(commits).issubset(c_commits): raise Exception("At least one provided commit does not exist in the repository.") commits = [git_repo.get_commit(h) for h in commits] u_commits = [c for c in commits if c.hash not in p_commits] if extraction_settings['no_of_processes'] > 1: _process_repo_parallel(git_repo_dir, sqlite_db_file, u_commits, extraction_settings) else: _process_repo_serial(git_repo_dir, sqlite_db_file, u_commits, extraction_settings)
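

# ------------------------------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the paths and options below are placeholders and not
# part of the module itself).
# ------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    repo_dir = 'path/to/repository'   # placeholder: local clone of the repository to mine
    db_file = 'path/to/output.db'     # placeholder: sqlite database that will be created/extended

    # Mine all commits line by line using four parallel processes.
    mine_git_repo(repo_dir, db_file, no_of_processes=4, extract_text=True)

    # Report which commits (if any) are still missing from the database.
    mining_state_summary(repo_dir, db_file)

    # The commit DAG and file-renaming aliases can be extracted independently of the database.
    dag = get_commit_dag(repo_dir)
    renaming_dag, aliases = identify_file_renaming(repo_dir)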