#!/usr/bin/env python
"""Download files matching a wildcard from GitHub repositories.

Repositories are read either from a file holding one repository URL per
line or from a crawl of the public repository listing.  Every file in a
repository's tree that matches the wildcard is saved to the output
directory, and a yara meta rule file containing one ``include`` statement
per file in that directory is written when the run ends.
"""
import argparse
import fnmatch
import logging
import os
import posixpath
import socket
import sys

import github

if sys.version_info < (3, 0):
    # python 2
    import urlparse
    from urllib import urlretrieve
    from urllib import quote
else:
    # python 3
    import urllib.parse as urlparse
    from urllib.request import urlretrieve
    from urllib.parse import quote

# Branch whose tree is listed and whose raw files are downloaded.
# NOTE(review): repositories whose default branch has another name (for
# example "main") fail at get_git_tree and are logged as fetch errors.
BRANCH = 'master'


def main(args, loglevel):
    """Configure logging and the socket timeout, then run the download.

    Args:
        args: Parsed command-line namespace (see the argument parser at the
            bottom of this file for the attributes that are read).
        loglevel: Level passed to ``logging.basicConfig``.
    """
    logging.basicConfig(format="%(levelname)s: %(message)s", level=loglevel)
    # float() keeps this working when a caller hands over the timeout as a
    # string; socket.setdefaulttimeout rejects non-numeric values.
    socket.setdefaulttimeout(float(args.timeout))

    if args.username and args.password:
        g = github.Github(args.username, args.password)
    else:
        g = github.Github()

    if args.repo_file:
        repo_gen = file_repo_gen(args.repo_file, g)
    else:
        repo_gen = g.get_repos(since=args.last_repo)

    download_files(args, g, repo_gen)


def file_repo_gen(repo_file, g):
    """Yield a repository object for every repository listed in a file.

    Each line may be a full URL or a bare ``owner/name`` string; everything
    up to and including ``github.com/`` is dropped.  Blank lines are
    skipped, and a line whose repository cannot be fetched is logged and
    skipped so that one bad entry does not end the whole run.

    Args:
        repo_file: Path of the text file to read.
        g: Client object providing ``get_repo``.
    """
    with open(repo_file, 'r') as f:
        for line in f:
            repo_str = line.strip().split('github.com/')[-1].strip('/')
            if repo_str.endswith('.git'):
                repo_str = repo_str[:-len('.git')]
            if not repo_str:
                continue
            try:
                repo = g.get_repo(repo_str)
            except Exception:
                logging.exception('Error fetching repository %s.', repo_str)
                continue
            yield repo


def _raw_url(full_name, path):
    """Return the raw-download URL for ``path`` inside repository ``full_name``.

    Only the path is percent-encoded: quoting the complete URL would also
    escape the ':' after the scheme and make the URL unusable.
    """
    if not isinstance(path, bytes):
        # quote() accepts bytes on both Python 2 and 3, whereas Python 2
        # fails on non-ASCII unicode input.
        path = path.encode('utf-8')
    return 'https://github.com/%s/raw/%s/%s' % (full_name, BRANCH, quote(path))


def _download(url, output_dir, file_counter):
    """Save ``url`` into ``output_dir``; failures are logged, not raised."""
    logging.info('Downloading %s', url)
    try:
        filename = posixpath.basename(urlparse.urlsplit(url).path)
        output_path = os.path.join(output_dir, filename)
        if os.path.exists(output_path):
            # Keep both files when two downloads share a base name.
            output_path += "-" + str(file_counter)
        urlretrieve(url, output_path)
    except Exception:
        logging.exception('Error downloading %s.', url)


def _write_yara_meta(output_dir, meta_name):
    """Write one ``include`` line per regular file in ``output_dir``.

    The meta file itself and any sub-directories are left out.  Returns the
    path of the meta file that was written.
    """
    meta_path = os.path.join(output_dir, meta_name)
    meta_abs = os.path.abspath(meta_path)
    with open(meta_path, 'w') as f:
        for entry in sorted(os.listdir(output_dir)):
            entry_path = os.path.join(output_dir, entry)
            if os.path.abspath(entry_path) == meta_abs:
                continue
            if not os.path.isfile(entry_path):
                continue
            try:
                f.write("include \"" + entry + "\"\n")
            except Exception:
                logging.exception('Couldn\'t write to %s.', meta_path)
    return meta_path


def download_files(args, g, repo_gen):
    """Download every matching file from each repository in ``repo_gen``.

    A repository that cannot be listed and a file that cannot be downloaded
    are logged and skipped.  The yara meta file is written when the loop
    ends, including when it ends early because of an exception.

    Args:
        args: Namespace providing ``wildcard``, ``output_dir`` and
            ``yara_meta``.  ``args.yara_meta`` is replaced by the full path
            of the meta file that was written.
        g: GitHub client; unused here, kept for interface compatibility.
        repo_gen: Iterable of repository objects.
    """
    # An empty output_dir means the current directory; os.listdir('') fails.
    output_dir = args.output_dir or os.curdir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    file_counter = 0
    try:
        for repo in repo_gen:
            try:
                logging.info('Fetching repository: %s (id: %i)',
                             repo.full_name, repo.id)
                tree = repo.get_git_tree(BRANCH, recursive=True)
                # Tree entries also describe directories ("tree"); only
                # blobs are downloadable files.
                urls = [
                    _raw_url(repo.full_name, entry.path)
                    for entry in tree.tree
                    if entry.type == 'blob'
                    and fnmatch.fnmatch(entry.path, args.wildcard)
                ]
            except Exception:
                logging.exception('Error fetching repository.')
                continue

            for url in urls:
                file_counter += 1
                _download(url, output_dir, file_counter)
    finally:
        # Written in a finally block so that an interrupted crawl still
        # leaves an index of what was downloaded.
        args.yara_meta = _write_yara_meta(output_dir, args.yara_meta)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Github file downloader")
    parser.add_argument("-u", "--username", default=None,
                        help="Username used to authenticate with github for increased rate limit")
    parser.add_argument("-p", "--password", default=None,
                        help="Password or token used to authenticate with github")
    parser.add_argument("-r", "--repo_file",
                        help="Path for the input file which contains a url of a Github repository for each separate line")
    parser.add_argument("-l", "--last_repo", type=int,
                        default=github.GithubObject.NotSet,
                        help="When not using a repo_file, this will be used as starting position for github repo crawl")
    # Required: without a pattern fnmatch receives None and every
    # repository fails with a TypeError.
    parser.add_argument("-w", "--wildcard", required=True,
                        help="Unix shell-style wildcard to match files to download (for example: *.txt)")
    parser.add_argument("-o", "--output_dir", default="",
                        help="Directory to store all downloaded files")
    parser.add_argument("-y", "--yara-meta", default="rules.yara",
                        help="Yara meta rule filename to create")
    # type=float: a value given on the command line would otherwise reach
    # socket.setdefaulttimeout as a string.
    parser.add_argument("-t", "--timeout", type=float, default=30,
                        help="Socket timeout (seconds)")
    parser.add_argument("-v", "--verbose", help="increase output verbosity",
                        action="store_true")
    args = parser.parse_args()

    # Setup logging
    if args.verbose:
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO

    main(args, loglevel)