# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#   Alvaro del Castillo San Felix <acs@bitergia.com>
#   Georg Link <georglink@bitergia.com>
#   Quan Zhou <quan@bitergia.com>
#

import json
import logging
import re
import sys

import pkg_resources
import requests

from elasticsearch import Elasticsearch, RequestsHttpConnection

from grimoirelab_toolkit.datetime import (datetime_to_utc,
                                          str_to_datetime,
                                          datetime_utcnow)
from perceval.backends.core.git import (GitCommand,
                                        GitRepository,
                                        EmptyRepositoryError,
                                        RepositoryError)

from .enrich import Enrich, metadata
from .study_ceres_aoc import areas_of_code, ESPandasConnector
from ..elastic_mapping import Mapping as BaseMapping
from ..elastic_items import HEADER_JSON, MAX_BULK_UPDATE_SIZE
from .utils import anonymize_url


GITHUB = 'https://github.com/'

DEMOGRAPHY_COMMIT_MIN_DATE = '1980-01-01'

AREAS_OF_CODE_ALIAS = 'git_areas_of_code'

logger = logging.getLogger(__name__)


class Mapping(BaseMapping):

    @staticmethod
    def get_elastic_mappings(es_major):
        """Get Elasticsearch mapping.

        :param es_major: major version of Elasticsearch, as string
        :returns: dictionary with a key, 'items', with the mapping
        """

        mapping = """
        {
            "properties": {
                "message_analyzed": {
                    "type": "text",
                    "index": true
                }
            }
        }"""

        return {"items": mapping}


class GitEnrich(Enrich):

    mapping = Mapping

    # REGEX to extract authors from a multi-author commit: several authors
    # present in the Author field of the commit.
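    # Illustrative examples (taken from the enrich_items docstring below):
    #   "Eduardo Morais and Zhongpeng Lin <companheiro.vermelho@gmail.com>"
    #   matches AUTHOR_P2P_REGEX, while
    #   "Co-authored-by: mariiapunda <mariiapunda@users.noreply.github.com>"
    #   matches AUTHOR_P2P_NEW_REGEX.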
    # Both are used only when self.pair_programming is True.
    AUTHOR_P2P_REGEX = re.compile(r'(?P<first_authors>.* .*) and (?P<last_author>.* .*) (?P<email>.*)')
    AUTHOR_P2P_NEW_REGEX = re.compile(r"Co-authored-by:(?P<first_authors>.* .*)<(?P<email>.*)>\n?")

    GIT_AOC_ENRICHED = "git_aoc-enriched"

    roles = ['Author', 'Commit']

    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
                 db_user='', db_password='', db_host='', pair_programming=False):
        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
                         db_user, db_password, db_host)

        self.studies = []
        self.studies.append(self.enrich_demography)
        self.studies.append(self.enrich_areas_of_code)
        self.studies.append(self.enrich_onion)
        self.studies.append(self.enrich_git_branches)
        self.studies.append(self.enrich_forecast_activity)
        self.studies.append(self.enrich_extra_data)

        self.rate_limit = None
        self.rate_limit_reset_ts = None
        self.min_rate_to_sleep = 100  # if pending rate < 100, sleep
        self.pair_programming = pair_programming

    def get_field_author(self):
        return "Author"

    def get_field_unique_id(self):
        # In pair programming mode uuid is not unique: a single commit can
        # generate several enriched items (one per author)
        if self.pair_programming:
            return "git_uuid"
        return "uuid"

    def get_field_date(self):
        """Field with the date in the JSON enriched items"""
        return "grimoire_creation_date"

    def __get_authors(self, authors_str):
        # Extract the authors from a multi-author field
        authors = []

        m = self.AUTHOR_P2P_REGEX.match(authors_str)
        if m:
            authors = m.group('first_authors').split(",")
            authors = [author.strip() for author in authors]
            authors += [m.group('last_author')]

        n = self.AUTHOR_P2P_NEW_REGEX.findall(authors_str)
        if n:
            for i in n:
                authors += [i[0]]

        # Remove duplicates
        authors = list(set(authors))

        return authors

    def get_identities(self, item):
        """Return the identities from an item."""

        if item['data']['Author']:
            # Check multi-author commits
            m = self.AUTHOR_P2P_REGEX.match(item['data']["Author"])
            n = self.AUTHOR_P2P_NEW_REGEX.match(item['data']["Author"])
            if (m or n) and self.pair_programming:
                authors = self.__get_authors(item['data']["Author"])
                for author in authors:
                    user = self.get_sh_identity(author)
                    yield user
            else:
                user = self.get_sh_identity(item['data']["Author"])
                yield user

        if item['data']['Commit']:
            m = self.AUTHOR_P2P_REGEX.match(item['data']["Commit"])
            n = self.AUTHOR_P2P_NEW_REGEX.match(item['data']["Commit"])
            if (m or n) and self.pair_programming:
                committers = self.__get_authors(item['data']['Commit'])
                for committer in committers:
                    user = self.get_sh_identity(committer)
                    yield user
            else:
                user = self.get_sh_identity(item['data']['Commit'])
                yield user

        if 'Signed-off-by' in item['data'] and self.pair_programming:
            signers = item['data']["Signed-off-by"]
            for signer in signers:
                user = self.get_sh_identity(signer)
                yield user

    def get_sh_identity(self, item, identity_field=None):
        # John Smith <john.smith@bitergia.com>
        identity = {}

        git_user = item  # by default a "Name <email>" string is expected
        if isinstance(item, dict) and 'data' in item:
            git_user = item['data'][identity_field]

        fields = git_user.split("<")
        name = fields[0]
        name = name.strip()  # Remove the space between the name and the email
        email = None
        if len(fields) > 1:
            email = git_user.split("<")[1][:-1]

        identity['username'] = None
        identity['email'] = email
        identity['name'] = name
        return identity

    def get_project_repository(self, eitem):
        return eitem['origin']

    @metadata
    def get_rich_item(self, item):
        eitem = {}

        self.copy_raw_fields(self.RAW_FIELDS_COPY, item, eitem)

        # For pair programming uuid is not a unique field; use git_uuid
        # as the unique field in general.
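        # For illustration: in pair programming mode, enrich_items derives the
        # git_uuid of each extra item by suffixing the original uuid, e.g.
        # "<uuid>_0", "<uuid>_1" (see enrich_items below).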
        eitem['git_uuid'] = eitem['uuid']

        # The real data
        commit = item['data']

        self.__fix_field_date(commit, 'AuthorDate')
        self.__fix_field_date(commit, 'CommitDate')

        # Data fields to copy
        copy_fields = ["message"]
        for f in copy_fields:
            if f in commit:
                eitem[f] = commit[f]
            else:
                eitem[f] = None

        # Fields whose names are translated
        map_fields = {"commit": "hash",
                      "message": "message_analyzed"}
        for fn in map_fields:
            if fn in commit:
                eitem[map_fields[fn]] = commit[fn]
            else:
                eitem[map_fields[fn]] = None

        if 'message' in commit:
            eitem['message'] = commit['message'][:self.KEYWORD_MAX_LENGTH]

        if 'refs' in commit:
            eitem["commit_tags"] = list(filter(lambda r: "tag: " in r, commit['refs']))

        eitem['hash_short'] = eitem['hash'][0:6]

        # Enrich dates
        author_date = str_to_datetime(commit["AuthorDate"])
        commit_date = str_to_datetime(commit["CommitDate"])

        eitem["author_date"] = author_date.replace(tzinfo=None).isoformat()
        eitem["commit_date"] = commit_date.replace(tzinfo=None).isoformat()

        eitem["author_date_weekday"] = author_date.replace(tzinfo=None).isoweekday()
        eitem["author_date_hour"] = author_date.replace(tzinfo=None).hour
        eitem["commit_date_weekday"] = commit_date.replace(tzinfo=None).isoweekday()
        eitem["commit_date_hour"] = commit_date.replace(tzinfo=None).hour

        utc_author_date = datetime_to_utc(author_date)
        utc_commit_date = datetime_to_utc(commit_date)

        eitem["utc_author"] = utc_author_date.replace(tzinfo=None).isoformat()
        eitem["utc_commit"] = utc_commit_date.replace(tzinfo=None).isoformat()

        eitem["utc_author_date_weekday"] = utc_author_date.replace(tzinfo=None).isoweekday()
        eitem["utc_author_date_hour"] = utc_author_date.replace(tzinfo=None).hour
        eitem["utc_commit_date_weekday"] = utc_commit_date.replace(tzinfo=None).isoweekday()
        eitem["utc_commit_date_hour"] = utc_commit_date.replace(tzinfo=None).hour

        eitem["tz"] = int(author_date.strftime("%z")[0:3])

        eitem["branches"] = []

        # Compute the time to commit
        time_to_commit_delta = datetime_to_utc(author_date) - datetime_to_utc(commit_date)
        eitem["time_to_commit_hours"] = round(time_to_commit_delta.seconds / 3600, 2)

        # Other enrichment
        eitem["repo_name"] = item["origin"]
        if eitem["repo_name"].startswith('http'):
            eitem["repo_name"] = anonymize_url(eitem["repo_name"])

        # Number of files touched
        eitem["files"] = 0
        # Number of lines added and removed
        lines_added = 0
        lines_removed = 0
        for cfile in commit["files"]:
            if 'action' not in cfile:
                # merges are not counted
                continue
            eitem["files"] += 1
            if 'added' in cfile and 'removed' in cfile:
                try:
                    lines_added += int(cfile["added"])
                    lines_removed += int(cfile["removed"])
                except ValueError:
                    # logger.warning(cfile)
                    continue
        eitem["lines_added"] = lines_added
        eitem["lines_removed"] = lines_removed
        eitem["lines_changed"] = lines_added + lines_removed

        # author_name and author_domain are always added
        identity = self.get_sh_identity(commit["Author"])
        eitem["author_name"] = identity['name']
        eitem["author_domain"] = self.get_identity_domain(identity)

        # Committer data
        identity = self.get_sh_identity(commit["Commit"])
        eitem["committer_name"] = identity['name']
        eitem["committer_domain"] = self.get_identity_domain(identity)

        # The title is the first line of the message
        if 'message' in commit:
            eitem["title"] = commit['message'].split('\n')[0]
        else:
            eitem["title"] = None

        # If it is a GitHub repo, include just the repo string
        if GITHUB in item['origin']:
            eitem['github_repo'] = item['origin'].replace(GITHUB, '')
            eitem['github_repo'] = re.sub('.git$', '', eitem['github_repo'])
            eitem["url_id"] = eitem['github_repo'] + "/commit/" + eitem['hash']
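            # Illustrative example: for origin
            # 'https://github.com/chaoss/grimoirelab-elk.git' this yields
            # github_repo 'chaoss/grimoirelab-elk' and
            # url_id 'chaoss/grimoirelab-elk/commit/<hash>'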
        if 'project' in item:
            eitem['project'] = item['project']

        # Add the git author domain
        author_domain = self.get_identity_domain(self.get_sh_identity(item, 'Author'))
        eitem['git_author_domain'] = author_domain

        eitem.update(self.get_grimoire_fields(commit["AuthorDate"], "commit"))
        # grimoire_creation_date is needed in the item
        item.update(self.get_grimoire_fields(commit["AuthorDate"], "commit"))

        eitem.update(self.get_item_sh(item, self.roles))

        if self.prjs_map:
            eitem.update(self.get_item_project(eitem))

        if self.pair_programming:
            eitem = self.__add_pair_programming_metrics(commit, eitem)

        self.add_repository_labels(eitem)
        self.add_metadata_filter_raw(eitem)
        return eitem

    def __fix_field_date(self, item, attribute):
        """Fix possible errors in the field date"""

        field_date = str_to_datetime(item[attribute])

        try:
            _ = int(field_date.strftime("%z")[0:3])
        except ValueError:
            logger.warning("[git] {} in commit {} has a wrong format".format(
                           attribute, item['commit']))
            item[attribute] = field_date.replace(tzinfo=None).isoformat()

    def __add_pair_programming_metrics(self, commit, eitem):

        def get_pair_programming_metrics(eitem, nauthors):
            ndecimals = 2
            metrics = {}
            files = eitem['files']
            ladded = eitem['lines_added']
            lremoved = eitem['lines_removed']
            lchanged = eitem['lines_changed']
            metrics['pair_programming_commit'] = round(1.0 / nauthors, ndecimals)
            metrics['pair_programming_files'] = round(files / nauthors, ndecimals)
            metrics["pair_programming_lines_added"] = round(ladded / nauthors, ndecimals)
            metrics["pair_programming_lines_removed"] = round(lremoved / nauthors, ndecimals)
            metrics["pair_programming_lines_changed"] = round(lchanged / nauthors, ndecimals)
            return metrics

        # Include pair programming metrics in all cases. In general, 1 author.
        eitem.update(get_pair_programming_metrics(eitem, 1))

        # Multi-author support
        eitem['is_git_commit_multi_author'] = 0
        if 'is_git_commit_multi_author' in commit:
            eitem['is_git_commit_multi_author'] = commit['is_git_commit_multi_author']
        if 'authors' in commit:
            eitem['authors'] = commit['authors']
            nauthors = len(commit['authors'])
            eitem.update(get_pair_programming_metrics(eitem, nauthors))

        # Pair programming support using Signed-off-by
        eitem['Signed-off-by_number'] = 0
        eitem['is_git_commit_signed_off'] = 0
        if 'Signed-off-by' in commit:
            eitem['Signed-off-by'] = commit['Signed-off-by']
            eitem['Signed-off-by_number'] = len(commit['Signed-off-by'])
            if 'is_git_commit_signed_off' in commit:
                # Commits generated for signed-off people
                eitem['is_git_commit_signed_off'] = commit['is_git_commit_signed_off']
                # The commit for the original Author also needs this data
                eitem['authors_signed_off'] = commit['authors_signed_off']
                nauthors = len(commit['authors_signed_off'])
                eitem.update(get_pair_programming_metrics(eitem, nauthors))

        return eitem

    def enrich_items(self, ocean_backend, events=False):
        """Implementation supporting signed-off and multi-author/committer commits.

        Multi-author/multi-committer commits are the ones authored/committed by
        more than one user. In such a case, ELK first extracts the author names
        from the raw data through a regex. For instance, in:

            ...
            "data": {
                "Author": "Eduardo Morais and Zhongpeng Lin <companheiro.vermelho@gmail.com>",
                "AuthorDate": "Tue Aug 14 14:32:15 2012 -0300",
                "Commit": "Eduardo Morais and Zhongpeng Lin <companheiro.vermelho@gmail.com>",
                "CommitDate": "Tue Aug 14 14:32:15 2012 -0300",
                "Signed-off-by": [
                    "Eduardo Morais <companheiro.vermelho@gmail.com>"
                ],
                "commit": "87783129c3f00d2c81a3a8e585eb86a47e39891a",
            ...
        the authors extracted are ['Eduardo Morais', 'Zhongpeng Lin'] (there can
        be more than 2). A new rich item is then created for each author name.
        For multi-committer commits only the first committer name is used to
        create the rich item. In case the commit is signed off, the raw data has
        an extra "Signed-off-by" attribute, used to create a new rich item for
        every author who signed off the commit, making sure duplicate entries
        are not created.

        Note: given
            "message": "Enable users to pass flags\n\nCo-authored-by: mariiapunda <mariiapunda@users.noreply.github.com>",
        co-authored commits like these are not considered multi-author commits
        in ELK.
        """
        headers = {"Content-Type": "application/json"}

        max_items = self.elastic.max_items_bulk
        current = 0
        total = 0
        bulk_json = ""
        total_signed_off = 0
        total_multi_author = 0

        url = self.elastic.get_bulk_url()

        logger.debug("[git] Adding items to {} (in {} packs)".format(anonymize_url(url), max_items))

        items = ocean_backend.fetch()
        for item in items:
            if self.pair_programming:
                # First we need to add the authors field to all commits
                # Check multi-author
                m = self.AUTHOR_P2P_REGEX.match(item['data']['Author'])
                n = self.AUTHOR_P2P_NEW_REGEX.match(item['data']['Author'])
                if m or n:
                    logger.debug("[git] Multiauthor detected. Creating one commit "
                                 "per author: {}".format(item['data']['Author']))
                    item['data']['authors'] = self.__get_authors(item['data']['Author'])
                    item['data']['Author'] = item['data']['authors'][0]
                m = self.AUTHOR_P2P_REGEX.match(item['data']['Commit'])
                n = self.AUTHOR_P2P_NEW_REGEX.match(item['data']['Commit'])
                if m or n:
                    logger.debug("[git] Multicommitter detected: using just the first committer")
                    item['data']['committers'] = self.__get_authors(item['data']['Commit'])
                    item['data']['Commit'] = item['data']['committers'][0]
                # Add the authors list using the original Author and the Signed-off-by list
                if 'Signed-off-by' in item['data']:
                    authors_all = item['data']['Signed-off-by'] + [item['data']['Author']]
                    item['data']['authors_signed_off'] = list(set(authors_all))

            if current >= max_items:
                try:
                    total += self.elastic.safe_put_bulk(url, bulk_json)
                    json_size = sys.getsizeof(bulk_json) / (1024 * 1024)
                    logger.debug("[git] Added {} items to {} ({:.2f} MB)".format(
                                 total, anonymize_url(url), json_size))
                except UnicodeEncodeError:
                    # Why is requests encoding the POST data as ascii?
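                    # Fallback: strip the non-ASCII bytes so the bulk request
                    # can still be uploaded (lossy, but keeps the enrichment going)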
logger.warning("[git] Unicode error in enriched items, converting to ascii") safe_json = str(bulk_json.encode('ascii', 'ignore'), 'ascii') total += self.elastic.safe_put_bulk(url, safe_json) bulk_json = "" current = 0 rich_item = self.get_rich_item(item) data_json = json.dumps(rich_item) unique_field = self.get_field_unique_id() bulk_json += '{"index" : {"_id" : "%s" } }\n' % (rich_item[unique_field]) bulk_json += data_json + "\n" # Bulk document current += 1 if self.pair_programming: # Multi author support if 'authors' in item['data']: # First author already added in the above commit authors = item['data']['authors'] for i in range(1, len(authors)): # logger.debug('Adding a new commit for %s', authors[i]) item['data']['Author'] = authors[i] item['data']['is_git_commit_multi_author'] = 1 rich_item = self.get_rich_item(item) item['data']['is_git_commit_multi_author'] = 1 data_json = json.dumps(rich_item) commit_id = item["uuid"] + "_" + str(i - 1) rich_item['git_uuid'] = commit_id bulk_json += '{"index" : {"_id" : "%s" } }\n' % rich_item['git_uuid'] bulk_json += data_json + "\n" # Bulk document current += 1 total_multi_author += 1 if rich_item['Signed-off-by_number'] > 0: nsg = 0 # Remove duplicates and the already added Author if exists authors = list(set(item['data']['Signed-off-by'])) if item['data']['Author'] in authors: authors.remove(item['data']['Author']) for author in authors: # logger.debug('Adding a new commit for %s', author) # Change the Author in the original commit and generate # a new enriched item with it item['data']['Author'] = author item['data']['is_git_commit_signed_off'] = 1 rich_item = self.get_rich_item(item) commit_id = item["uuid"] + "_" + str(nsg) rich_item['git_uuid'] = commit_id data_json = json.dumps(rich_item) bulk_json += '{"index" : {"_id" : "%s" } }\n' % rich_item['git_uuid'] bulk_json += data_json + "\n" # Bulk document current += 1 total_signed_off += 1 nsg += 1 if current > 0: total += self.elastic.safe_put_bulk(url, bulk_json) if total == 0: # No items enriched, nothing to upload to ES return total if self.pair_programming: logger.info("[git] Signed-off commits generated: {}".format(total_signed_off)) logger.info("[git] Multi author commits generated: {}".format(total_multi_author)) return total def enrich_demography(self, ocean_backend, enrich_backend, date_field="grimoire_creation_date", author_field="author_uuid"): super().enrich_demography(ocean_backend, enrich_backend, date_field, author_field=author_field) def enrich_areas_of_code(self, ocean_backend, enrich_backend, no_incremental=False, in_index="git-raw", out_index=GIT_AOC_ENRICHED, sort_on_field='metadata__timestamp'): log_prefix = "[git] study areas_of_code" logger.info("{} Starting study - Input: {} Output: {}".format(log_prefix, in_index, out_index)) # Creating connections es_in = Elasticsearch([ocean_backend.elastic.url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) es_out = Elasticsearch([enrich_backend.elastic.url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) in_conn = ESPandasConnector(es_conn=es_in, es_index=in_index, sort_on_field=sort_on_field) out_conn = ESPandasConnector(es_conn=es_out, es_index=out_index, sort_on_field=sort_on_field, read_only=False) exists_index = out_conn.exists() if no_incremental or not exists_index: logger.info("{} Creating out ES index".format(log_prefix)) # Initialize out index if self.elastic.major == '7': 
            if self.elastic.major == '7':
                filename = pkg_resources.resource_filename('grimoire_elk', 'enriched/mappings/git_aoc_es7.json')
            else:
                filename = pkg_resources.resource_filename('grimoire_elk', 'enriched/mappings/git_aoc.json')
            out_conn.create_index(filename, delete=exists_index)

        repos = []
        for source in self.json_projects.values():
            items = source.get('git')
            if items:
                repos.extend(items)

        for repo in repos:
            anonymize_repo = anonymize_url(repo)
            logger.info("{} Processing repo: {}".format(log_prefix, anonymize_repo))
            in_conn.update_repo(anonymize_repo)
            out_conn.update_repo(anonymize_repo)
            areas_of_code(git_enrich=enrich_backend, in_conn=in_conn, out_conn=out_conn)

            # Delete the documents in the AOC index which correspond to commits
            # that don't exist in the raw index
            if out_conn.exists():
                self.update_items_aoc(ocean_backend, es_out, out_index, anonymize_repo)

        # Create the alias if the output index exists and the alias does not
        if out_conn.exists():
            if not out_conn.exists_alias(AREAS_OF_CODE_ALIAS) \
                    and not enrich_backend.elastic.alias_in_use(AREAS_OF_CODE_ALIAS):
                logger.info("{} creating alias: {}".format(log_prefix, AREAS_OF_CODE_ALIAS))
                out_conn.create_alias(AREAS_OF_CODE_ALIAS)
            else:
                logger.warning("{} alias already exists: {}.".format(log_prefix, AREAS_OF_CODE_ALIAS))

        logger.info("{} end".format(log_prefix))

    def get_unique_hashes_aoc(self, es_aoc, index_aoc, repository):
        """Retrieve the unique commit hashes in the AOC index

        :param es_aoc: the ES object to access the AOC data
        :param index_aoc: the AOC index
        :param repository: the target repository
        """
        def __unique_commit_hashes_aoc(repository, until_date=None):
            """Build the query to retrieve all unique commit hashes in ascending
            order on grimoire_creation_date for a given repository in the AOC index"""

            fltr = [
                {
                    "term": {
                        "repository": repository
                    }
                }
            ]

            if until_date:
                fltr.append({
                    "range": {
                        "metadata__updated_on": {
                            "gte": until_date
                        }
                    }
                })

            query_unique_hashes = """
            {
              "aggs": {
                "2": {
                  "terms": {
                    "field": "hash",
                    "size": 1000,
                    "order": {
                      "1": "asc"
                    }
                  },
                  "aggs": {
                    "1": {
                      "max": {
                        "field": "grimoire_creation_date"
                      }
                    }
                  }
                }
              },
              "size": 0,
              "query": {
                "bool": {
                  "filter": %s
                }
              }
            }
            """ % json.dumps(fltr)

            return query_unique_hashes

        aoc_hashes = []
        fetching = True
        last_date = None
        previous_date = None
        while fetching:
            hits = es_aoc.search(index=index_aoc, body=__unique_commit_hashes_aoc(repository, last_date))
            buckets = hits['aggregations']['2']['buckets']

            if not buckets:
                fetching = False

            for bucket in buckets:
                aoc_hashes.append(bucket['key'])
                last_date = bucket['1']['value_as_string']

            if previous_date == last_date:
                fetching = False

            previous_date = last_date

        return aoc_hashes

    def get_diff_commits_raw_aoc(self, ocean_backend, es_aoc, index_aoc, repository):
        """Return the commit hashes which are stored in the AOC index but not in the Git raw index.
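
        These are commits whose documents remain in the AOC index but which no
        longer exist in the raw index (e.g. they were removed from the
        repository); the caller deletes their AOC documents (see update_items_aoc).
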
        :param ocean_backend: the Ocean backend
        :param es_aoc: the ES object to access the AOC data
        :param index_aoc: the AOC index
        :param repository: the target repository
        """
        fltr = {
            'name': 'origin',
            'value': [repository]
        }

        raw_hashes = set([item['data']['commit'] for item
                          in ocean_backend.fetch(ignore_incremental=True, _filter=fltr)])
        aoc_hashes = set(self.get_unique_hashes_aoc(es_aoc, index_aoc, repository))

        hashes_to_delete = list(aoc_hashes.difference(raw_hashes))

        return hashes_to_delete

    def update_items_aoc(self, ocean_backend, es_aoc, index_aoc, repository):
        """Update the documents stored in the AOC index by deleting the ones
        that correspond to commits deleted from the repository.

        :param ocean_backend: the Ocean backend to access the raw data
        :param es_aoc: the ES object to access the AOC data
        :param index_aoc: the AOC index
        :param repository: the target repository
        """
        aoc_index_url = self.elastic_url + '/' + index_aoc
        hashes_to_delete = self.get_diff_commits_raw_aoc(ocean_backend, es_aoc, index_aoc, repository)

        to_process = []
        for _hash in hashes_to_delete:
            to_process.append(_hash)

            if len(to_process) != MAX_BULK_UPDATE_SIZE:
                continue

            # Delete the documents from the AOC index
            self.remove_commits(to_process, aoc_index_url, 'hash', repository)
            to_process = []

        if to_process:
            # Delete the documents from the AOC index
            self.remove_commits(to_process, aoc_index_url, 'hash', repository)

        logger.debug("[git] study areas_of_code {} commits deleted from {} with origin {}.".format(
                     len(hashes_to_delete), anonymize_url(aoc_index_url), repository))

    def enrich_onion(self, ocean_backend, enrich_backend, no_incremental=False,
                     in_index='git_onion-src', out_index='git_onion-enriched',
                     data_source='git', contribs_field='hash',
                     timeframe_field='grimoire_creation_date',
                     sort_on_field='metadata__timestamp',
                     seconds=Enrich.ONION_INTERVAL):

        super().enrich_onion(enrich_backend=enrich_backend,
                             in_index=in_index,
                             out_index=out_index,
                             data_source=data_source,
                             contribs_field=contribs_field,
                             timeframe_field=timeframe_field,
                             sort_on_field=sort_on_field,
                             no_incremental=no_incremental,
                             seconds=seconds)

    def get_diff_commits_origin_raw(self, ocean_backend):
        """Return the commit hashes which are stored in the raw index but not in the original repo.
        :param ocean_backend: the Ocean backend
        """
        repo_origin = anonymize_url(self.perceval_backend.origin)

        fltr = {
            'name': 'origin',
            'value': [repo_origin]
        }

        current_hashes = []
        try:
            git_repo = GitRepository(self.perceval_backend.uri, self.perceval_backend.gitpath)
            current_hashes = [commit for commit in git_repo.rev_list()]
        except EmptyRepositoryError:
            logger.warning("No commits retrieved from {}, repo is empty".format(repo_origin))
        except RepositoryError:
            logger.warning("No commits retrieved from {}, repo doesn't exist locally".format(repo_origin))
        except Exception as e:
            logger.error("[git] No commits retrieved from {}, git rev-list command failed: {}".format(repo_origin, e))

        if not current_hashes:
            return current_hashes

        current_hashes = set(current_hashes)
        raw_hashes = set([item['data']['commit'] for item
                          in ocean_backend.fetch(ignore_incremental=True, _filter=fltr)])

        hashes_to_delete = list(raw_hashes.difference(current_hashes))

        return hashes_to_delete

    def update_items(self, ocean_backend, enrich_backend):
        """Retrieve the commits not present in the original repository and delete
        the corresponding documents from the raw and enriched indexes"""

        repo_origin = anonymize_url(self.perceval_backend.origin)

        logger.debug("[git] update-items Checking commits for {}.".format(repo_origin))

        hashes_to_delete = self.get_diff_commits_origin_raw(ocean_backend)

        to_process = []
        for _hash in hashes_to_delete:
            to_process.append(_hash)

            if len(to_process) != MAX_BULK_UPDATE_SIZE:
                continue

            # Delete the documents from the raw index
            self.remove_commits(to_process, ocean_backend.elastic.index_url, 'data.commit', repo_origin)
            # Delete the documents from the enriched index
            self.remove_commits(to_process, enrich_backend.elastic.index_url, 'hash', repo_origin)
            to_process = []

        if to_process:
            # Delete the documents from the raw index
            self.remove_commits(to_process, ocean_backend.elastic.index_url, 'data.commit', repo_origin)
            # Delete the documents from the enriched index
            self.remove_commits(to_process, enrich_backend.elastic.index_url, 'hash', repo_origin)

        logger.debug("[git] update-items {} commits deleted from {} with origin {}.".format(
                     len(hashes_to_delete), anonymize_url(ocean_backend.elastic.index_url), repo_origin))
        logger.debug("[git] update-items {} commits deleted from {} with origin {}.".format(
                     len(hashes_to_delete), anonymize_url(enrich_backend.elastic.index_url), repo_origin))

    def remove_commits(self, items, index, attr, origin, origin_attr='origin'):
        """Delete documents that correspond to commits deleted in the Git repository

        :param items: target items to be deleted
        :param index: target index
        :param attr: name of the term attribute to search items
        :param origin: name of the origin from which the items must be deleted
        :param origin_attr: attribute where the origin info is stored
        """
        es_query = '''
            {
              "query": {
                "bool": {
                  "must": {
                    "term": {
                      "%s": "%s"
                    }
                  },
                  "filter": {
                    "terms": {
                      "%s": [%s]
                    }
                  }
                }
              }
            }
            ''' % (origin_attr, origin, attr, ",".join(['"%s"' % i for i in items]))

        r = self.requests.post(index + "/_delete_by_query?refresh", data=es_query,
                               headers=HEADER_JSON, verify=False)
        try:
            r.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error("[git] Error updating deleted commits for {}.".format(anonymize_url(index)))
            logger.error(r.text)
            return

    def enrich_git_branches(self, ocean_backend, enrich_backend, run_month_days=[7, 14, 21, 28]):
        """Update the information about branches within the documents representing
        commits in the enriched index.
        The example below shows how to activate the study by modifying the
        setup.cfg. The study `enrich_git_branches` runs only on the days of the
        month given in the parameter `run_month_days`; by default these are the
        7th, 14th, 21st and 28th of each month.

        ```
        [git]
        raw_index = git_raw
        enriched_index = git_enriched
        ...
        studies = [enrich_git_branches]

        [enrich_git_branches]
        run_month_days = [5, 22]
        ```

        :param ocean_backend: the ocean backend
        :param enrich_backend: the enrich backend
        :param run_month_days: days of the month on which to run this study
        """
        logger.debug("[git] study git-branches start")

        day = datetime_utcnow().day
        run_month_days = list(map(int, run_month_days))
        if day not in run_month_days:
            logger.debug("[git] study git-branches will execute only on days {} of each month".format(run_month_days))
            logger.debug("[git] study git-branches end")
            return

        for ds in self.prjs_map:
            if ds != "git":
                continue

            urls = self.prjs_map[ds]
            for url in urls:
                cmd = GitCommand(*[url])
                git_repo = GitRepository(cmd.parsed_args.uri, cmd.parsed_args.gitpath)

                logger.debug("[git] study git-branches delete branch info for repo {} in index {}".format(
                             git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))
                self.delete_commit_branches(git_repo, enrich_backend)

                logger.debug("[git] study git-branches add branch info for repo {} in index {}".format(
                             git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))
                try:
                    self.add_commit_branches(git_repo, enrich_backend)
                except Exception as e:
                    logger.error("[git] study git-branches failed on repo {}, due to {}".format(git_repo.uri, e))
                    continue
                logger.debug("[git] study git-branches repo {} in index {} processed".format(
                             git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))

        logger.debug("[git] study git-branches end")

    def delete_commit_branches(self, git_repo, enrich_backend):
        """Delete the information about branches from the documents representing
        commits in the enriched index.

        :param git_repo: GitRepository object
        :param enrich_backend: the enrich backend
        """
        fltr = """
            "filter": [
                {
                    "term": {
                        "origin": "%s"
                    }
                }
            ]
        """ % anonymize_url(git_repo.uri)

        # Reset the branch references in the enriched index
        es_query = """
            {
              "script": {
                "source": "ctx._source.branches = new HashSet();",
                "lang": "painless"
              },
              "query": {
                "bool": {
                  %s
                }
              }
            }
            """ % fltr

        index = enrich_backend.elastic.index_url
        r = self.requests.post(index + "/_update_by_query?refresh", data=es_query,
                               headers=HEADER_JSON, verify=False)
        try:
            r.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error("[git] Error while deleting branches on {}".format(anonymize_url(index)))
            logger.error(r.text)
            return

        logger.debug("[git] Delete branches {}, index {}".format(r.text, anonymize_url(index)))

    def add_commit_branches(self, git_repo, enrich_backend):
        """Add the information about branches to the documents representing
        commits in the enriched index.

        Branches are obtained with the command `git ls-remote`; then, for each
        branch, the list of commits is retrieved via the command
        `git rev-list <branch-name>` and used to update the corresponding items
        in the enriched index.
        :param git_repo: GitRepository object
        :param enrich_backend: the enrich backend
        """
        to_process = []

        for _hash, refname in git_repo._discover_refs(remote=True):
            if not refname.startswith('refs/heads/'):
                continue

            commit_count = 0
            branch_name = refname.replace('refs/heads/', '')

            try:
                commits = git_repo.rev_list([branch_name])

                for commit in commits:
                    to_process.append(commit)
                    commit_count += 1

                    if commit_count == MAX_BULK_UPDATE_SIZE:
                        self.__process_commits_in_branch(enrich_backend, git_repo.uri, branch_name, to_process)

                        # Reset the counter
                        to_process = []
                        commit_count = 0

                if commit_count:
                    self.__process_commits_in_branch(enrich_backend, git_repo.uri, branch_name, to_process)
                    # Reset for the next branch
                    to_process = []
            except Exception as e:
                logger.error("[git] Skip adding branch info for repo {} due to {}".format(git_repo.uri, e))
                return

    def __process_commits_in_branch(self, enrich_backend, repo_origin, branch_name, commits):
        commits_str = ",".join(['"%s"' % c for c in commits])

        # Process branch names which include single or double quotes
        digested_branch_name = branch_name
        if "'" in branch_name:
            digested_branch_name = branch_name.replace("'", "---")
            logger.warning("[git] Change branch name from {} to {}".format(branch_name, digested_branch_name))
        if '"' in branch_name:
            digested_branch_name = branch_name.replace('"', "---")
            logger.warning("[git] Change branch name from {} to {}".format(branch_name, digested_branch_name))

        # Update the enriched index
        fltr = self.__prepare_filter("hash", commits_str, anonymize_url(repo_origin))
        es_query = """
            {
              "script": {
                "source": "if(!ctx._source.branches.contains(params.branch)){ctx._source.branches.add(params.branch);}",
                "lang": "painless",
                "params": {
                  "branch": "%s"
                }
              },
              "query": {
                "bool": {
                  %s
                }
              }
            }
            """ % (digested_branch_name, fltr)

        index = enrich_backend.elastic.index_url
        r = self.requests.post(index + "/_update_by_query?refresh", data=es_query,
                               headers=HEADER_JSON, verify=False)
        try:
            r.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error("[git] Error adding branch info for {}".format(anonymize_url(index)))
            logger.error(r.text)
            return

        logger.debug("[git] Add branches {}, index {}".format(r.text, anonymize_url(index)))

    def __prepare_filter(self, terms_attr, terms_value, repo_origin):
        fltr = """
            "filter": [
                {
                    "terms": {
                        "%s": [%s]
                    }
                },
                {
                    "term": {
                        "origin": "%s"
                    }
                }
            ]
        """ % (terms_attr, terms_value, repo_origin)

        return fltr