"""Helpers for locating a project's repository, changelog files and release pages."""
import logging
import re

import validators
from lxml import etree
from requests import ConnectionError

logger = logging.getLogger(__name__)

# Patterns for the canonical repository root on each supported host. Dots are
# escaped so that only the real host names match.
_GITHUB_REPO_RE = re.compile(r"https?://(?:www\.)?github\.com/[\w\-]+/[\w.-]+")
_BITBUCKET_REPO_RE = re.compile(r"https?://bitbucket\.org/[\w.-]+/[\w.-]+")
_LAUNCHPAD_REPO_RE = re.compile(r"https?://launchpad\.net/[\w.-]+")
_SOURCEFORGE_PROJECT_RE = re.compile(
    r"https?://sourceforge\.net/projects/([\w.-]+)/", re.I
)
_SOURCEFORGE_TEMPLATE = "https://sourceforge.net/p/{}/code/HEAD/tree/trunk/src/"

# Hosts whose links are considered when scraping a page for repository URLs.
_REPO_LINK_HOSTS = ("github.com", "bitbucket.org", "sourceforge.net")

# Python markers stripped from names before comparing them. Longer markers come
# first, otherwise "-py" would eat the start of "-python" and leave "thon" behind.
_PYTHON_MARKERS = ("python-", "-python", "py-", "-py")


def validate_url(url):
    """
    Validates the URL
    :param url: str, URL or bare domain
    :return: str, `url` if it is a valid URL, `url` prefixed with "http://" if it is a
    valid domain, empty string otherwise
    """
    if not url:
        # nothing to validate; also keeps None away from the validators package
        return ""
    if validators.url(url):
        return url
    if validators.domain(url):
        return "http://{}".format(url)
    return ""


def validate_repo_url(url):
    """
    Validates and formats `url` to be valid URL pointing to a repo on github.com,
    bitbucket.org, launchpad.net or sourceforge.net
    :param url: str, URL
    :return: str, valid URL if valid repo, empty string otherwise
    """
    if not url:
        return ""
    if "github.com" in url:
        found = _GITHUB_REPO_RE.search(url)
        return found.group(0) if found else ""
    if "bitbucket.org" in url:
        found = _BITBUCKET_REPO_RE.search(url)
        # point at the source browser, that's where the file links live
        return found.group(0) + "/src/" if found else ""
    if "launchpad.net" in url:
        found = _LAUNCHPAD_REPO_RE.search(url)
        return found.group(0) if found else ""
    if "sourceforge.net" in url:
        # anchored at the start of the URL: only ".../projects/<name>/" is accepted
        found = _SOURCEFORGE_PROJECT_RE.match(url)
        if found:
            return _SOURCEFORGE_TEMPLATE.format(found.group(1))
    return ""


def contains_project_name(name, link):
    """
    Checks if the given link `somewhat` contains the project name.
    :param name: str, project name
    :param link: str, link
    :return: bool, True if the link contains the project name
    """
    def unclutter(string):
        # strip out all python references and remove all excessive characters
        string = string.lower().replace("_", "-").replace(".", "-")
        for marker in _PYTHON_MARKERS:
            string = string.replace(marker, "")
        return re.sub("[^0123456789 a-zA-Z]", "", string).strip()
    return unclutter(name) in unclutter(link)


def _extract_links(content, strip_query=False):
    """
    Collects the href targets of all anchors found in the given HTML content.
    :param content: bytes/str, raw HTML
    :param strip_query: bool, drop everything after the first "?" of each link
    :return: frozenset, links as plain strings; empty if the page can't be parsed
    """
    try:
        tree = etree.HTML(content)
        if tree is None:
            # the parser gave us no document at all
            return frozenset()
        # xpath() isn't returning raw strings, so we have to convert them first
        hrefs = [str(href) for href in tree.xpath("//a/@href")]
    except (etree.XMLSyntaxError, UnicodeEncodeError):
        return frozenset()
    if strip_query:
        hrefs = [href.split("?")[0] for href in hrefs]
    return frozenset(hrefs)


def find_repo_urls(session, name, candidates):
    """
    Visits the given URL candidates and searches the page for valid links to a repository.
    :param session: requests Session instance
    :param name: str, project name
    :param candidates: list, list of URL candidates
    :return: generator, yields str URLs to a repo
    """
    for candidate in candidates:
        page_url = validate_url(candidate)
        if not page_url:
            continue
        try:
            # fetch the validated URL: a bare domain only gets its scheme there
            resp = session.get(page_url)
        except ConnectionError:
            # we really don't care about connection errors here. a lot of project pages are
            # simply down because the project is no longer maintained
            continue
        if resp.status_code != 200:
            continue
        for link in _extract_links(resp.content):
            # check if the link 1) points to one of the known repo hosts AND 2) somewhat
            # contains the project name
            if not any(host in link for host in _REPO_LINK_HOSTS):
                continue
            if not contains_project_name(name, link):
                continue
            repo_url = validate_url(validate_repo_url(url=link))
            if repo_url:
                logger.debug("Found repo URL %s", repo_url)
                yield repo_url


# changelogs come in all forms and colors. This set contains most of them, e.g. (HISTORY, history,
# History.md, HISTORY.rst ... etc.)
_CHANGELOG_BASENAMES = (
    "history", "news", "releases", "release", "changes", "changelog", "log"
)
_CHANGELOG_EXTENSIONS = ("", ".txt", ".md", ".rst", ".adoc")

CHANGELOG_FILENAME_CANDIDATES = frozenset(
    [
        variant + extension
        for basename in _CHANGELOG_BASENAMES
        for extension in _CHANGELOG_EXTENSIONS
        for variant in (basename, basename.upper(), basename.capitalize())
    ] + ["ReleaseNotes.wiki"]
)

DOCS_CANDIDATES = frozenset([
    "docs", "doc", "documentation", "docs-src", "wiki",
    "docs/", "doc/", "documentation/", "docs-src/", "wiki/"
])


def _raw_changelog_url(repo_url, link):
    """
    Builds the URL to the raw content of a changelog file linked from a repo page.
    :param repo_url: str, URL to the repo the link was found on
    :param link: str, link to the changelog file
    :return: str, URL to the raw file, empty string if none can be built
    """
    if "github.com" in repo_url and "blob" in link:
        # links may be absolute, cut them down to the path below the repo root
        link = link.replace(repo_url, "")
        return validate_url(
            "https://raw.githubusercontent.com" + link.replace("/blob/", "/")
        )
    if "bitbucket.org" in repo_url and "src" in link:
        return validate_url("https://bitbucket.org" + link.replace("/src/", "/raw/"))
    if "sourceforge.net" in repo_url:
        return validate_url(repo_url + link + "?format=raw")
    return ""


def _docs_url(repo_url, link):
    """
    Builds the absolute URL to a documentation directory linked from a repo page.
    :param repo_url: str, URL to the repo the link was found on
    :param link: str, link to the documentation directory
    :return: str, absolute URL, empty string if the link is not a directory on the repo
    """
    if "github.com" in repo_url and "tree" in link:
        if link.startswith("https://github.com"):
            return link
        return "https://github.com" + link
    if "bitbucket.org" in repo_url and "src" in link:
        return "https://bitbucket.org" + link
    return ""


def find_changelog(session, repo_url, deep=True):
    """
    Tries to find changelogs on the given `repo_url`.
    :param session: requests Session instance
    :param repo_url: str, URL to the repo
    :param deep: bool, deep search
    :return: generator, yields str URLs to the raw changelog content
    """
    logger.debug("Trying to find changelog on repo %s", repo_url)
    resp = session.get(repo_url)
    if resp.status_code != 200:
        return

    # build up a list of URLs on this repo. We also need to strip out all GET parameters if any.
    links = _extract_links(resp.content, strip_query=True)

    # we are going to check for valid changelog links on the root first. We do that by
    # checking if the link ends with one of our changelog filename candidates. Testing all
    # candidates at once keeps a link that matches several of them (e.g. CHANGELOG and LOG)
    # from being reported more than once.
    changelog_suffixes = tuple(CHANGELOG_FILENAME_CANDIDATES)
    found = False
    for link in links:
        if not link.endswith(changelog_suffixes):
            continue
        raw_url = _raw_changelog_url(repo_url, link)
        if raw_url:
            found = True
            yield raw_url

    # if this is a deep search and we haven't found any changelogs on the repo root, we are
    # going to check every potential doc page.
    if deep and not found:
        docs_suffixes = tuple(DOCS_CANDIDATES)
        for link in links:
            if not link.endswith(docs_suffixes):
                continue
            sublink = _docs_url(repo_url, link)
            # if we find a valid link to a doc subdirectory on the repo call this
            # function again and yield all possible changelog hits
            if sublink:
                for _url in find_changelog(session, sublink, deep=False):
                    yield _url


def find_release_page(session, repo_url):
    """
    Checks if the given github repo has a release page.
    :param session: requests Session instance
    :param repo_url: str, URL to the repo
    :return: generator, yields the str URL to the github releases API of the repo if the
    release page responds with a 200, nothing otherwise (or if the repo is not on github)
    """
    if "github.com" not in repo_url:
        return
    logger.debug("Unable to find changelog on %s, try release page", repo_url)
    # https://github.com/<username>/<reponame> -> path segments 3 and 4
    owner_and_repo = repo_url.split("/")[3:5]
    if len(owner_and_repo) != 2:
        logger.debug("Unable to construct releases url for %s", repo_url)
        return
    username, reponame = owner_and_repo
    # try to fetch the release page. if it 200s, yield the release page
    # api URL for further processing
    resp = session.get("https://github.com/{username}/{reponame}/releases".format(
        username=username, reponame=reponame
    ))
    if resp.status_code == 200:
        yield "https://api.github.com/repos/{username}/{reponame}/releases".format(
            username=username, reponame=reponame
        )


def filter_repo_urls(candidates):
    """
    Filters down a list of URL candidates
    :param candidates: list, URL candidates
    :return: set, Repo URLs
    """
    # first, we are going to filter down the URL candidates to be all valid urls
    valid_urls = set()
    for candidate in candidates:
        url = validate_url(candidate)
        if url:
            valid_urls.add(url)
    logger.info("Got repo candidates %s", valid_urls)

    repos = set()
    for url in valid_urls:
        repo_url = validate_repo_url(url)
        if repo_url:
            repos.add(repo_url)
    logger.info("Filtered initial candidates down to %s", repos)
    return repos


def _discover_repos(session, name, candidates):
    """
    Resolves the URL candidates to repo URLs, visiting the candidate pages if none of
    the candidates is a repo URL itself.
    :param session: requests Session instance
    :param name: str, project name
    :param candidates: list, URL candidates
    :return: set, repo URLs
    """
    repos = filter_repo_urls(candidates=candidates)
    # if we are unlucky and there isn't a valid repo URL in our URL candidates, we need to go
    # deeper and check the URLs if they contain a link to a repo
    if not repos:
        logger.info("No repo found, trying to find one on related sites %s", candidates)
        repos = set(find_repo_urls(session, name, candidates))
    return repos


def find_changelogs(session, name, candidates):
    """
    Tries to find changelogs on the given URL candidates
    :param session: requests Session instance
    :param name: str, project name
    :param candidates: list, URL candidates
    :return: tuple, (set(changelog URLs), set(repo URLs))
    """
    repos = _discover_repos(session, name, candidates)

    urls = []
    for repo in repos:
        for url in find_changelog(session, repo):
            if not contains_project_name(name, url):
                logger.debug(
                    "Found changelog on %s, but it does not contain the project name "
                    "%s, aborting", url, name
                )
                continue
            urls.append(url)

    if not urls:
        # at this point we failed to fetch a changelog from plain files. we might find one on
        # the github release page.
        logger.debug("No plain changelog urls found, trying release page")
        for repo in repos:
            # make sure the link to the release page contains the project name
            if contains_project_name(name, repo):
                urls.extend(find_release_page(session, repo))
    return set(urls), repos


def find_git_repo(session, name, candidates):
    """
    Tries to find git repos on the given URL candidates
    :param session: requests Session instance
    :param name: str, project name
    :param candidates: list, URL candidates
    :return: tuple, (set(git URLs), set(repo URLs))
    """
    repos = _discover_repos(session, name, candidates)

    urls = []
    for repo in repos:
        owner_and_repo = repo.split("/")[3:5]
        if len(owner_and_repo) != 2:
            # e.g. launchpad URLs only carry a project name, there is nothing to build here
            continue
        username, reponame = owner_and_repo
        if "github.com" in repo:
            if reponame.endswith(".git"):
                # the repo URL may already carry the suffix, don't add it twice
                reponame = reponame[:-len(".git")]
            urls.append(
                "https://github.com/{username}/{reponame}.git".format(
                    username=username, reponame=reponame
                )
            )
        elif "bitbucket.org" in repo:
            urls.append(
                "https://bitbucket.org/{username}/{reponame}".format(
                    username=username, reponame=reponame
                )
            )
    return set(urls), repos