"""Extract URLs from tool output and record the in-scope ones in the database."""

#from urlextract import URLExtract
import re

import lib.db
import lib.utils

try:
    import urlparse  # Python 2
except ImportError:  # Python 3: same API, different module path
    import urllib.parse as urlparse

import requests

#TODO: Add this when i move project to python3
# def extract_urls_urlextractor(tool_output):
#     extractor = URLExtract()
#     urls = extractor.find_urls(tool_output)
#     return urls

# Compiled once at import time; the pattern text is unchanged from the
# original, it is only written as a raw string so that "\(" is not treated as
# an (invalid) string escape.
_URL_REGEX = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# URLs ending in one of these suffixes are dropped by extract_urls_regex().
_NOT_INTERESTING_EXTENSIONS = (
    ".png", ".ico", ".js", ".css", ".woff2", ".ttf",
    ".jpg", ".jpeg", ".svg", ".eot", ".woff", ".gif",
)

# Port used when the URL does not carry an explicit one.
_DEFAULT_PORTS = {"http": 80, "https": 443}

# Sentinel status returned by check_if_page_exists() when the request fails.
_REQUEST_FAILED = 999


def extract_urls_regex(tool_output):
    """Return every http(s) URL found in *tool_output*, minus static assets.

    A URL is discarded when it ends with one of the suffixes listed in
    ``_NOT_INTERESTING_EXTENSIONS``. The comparison is case-sensitive and looks
    at the end of the whole matched string, so a trailing query string
    prevents the filter from matching. Order and duplicates are preserved.
    """
    return [
        url
        for url in _URL_REGEX.findall(tool_output)
        if not url.endswith(_NOT_INTERESTING_EXTENSIONS)
    ]


def extract_urls(tool_output):
    """Return the list of URLs found in *tool_output*."""
    #TODO: Add uncomment these three lines and comment out the forth when i move project to python3
    #a = extract_urls_urlextractor(tool_output)
    #b = extract_urls_regex(tool_output)
    #urls = list(set().union(a, b))
    urls = extract_urls_regex(tool_output)
    return urls


def is_url_in_scope(url):
    """Check whether the host of *url* is known for the current workspace.

    Returns:
        ``("True", vhost, port, url_without_trailing_slash, workspace)`` when
        ``lib.db.is_vhost_in_db(vhost, workspace)`` is truthy, otherwise the
        plain string ``"False"``. The two shapes differ, so callers must check
        the shape before unpacking.

        ``port`` is the string taken from the URL when one is given explicitly
        and the integer 80/443 otherwise.
        NOTE(review): the mixed str/int port type is inherited from the
        original code -- confirm what lib.db.insert_new_path expects.

    A URL that cannot be split into host and port, or that has no explicit
    port and a scheme other than http/https, is reported as ``"False"`` after
    printing ``error parsing url``.
    """
    workspace = lib.db.get_current_workspace()[0][0]
    try:
        parsed_url = urlparse.urlparse(url)
        scheme = parsed_url[0]
        netloc = parsed_url[1]
        if ":" in netloc:
            # Raises ValueError for anything that is not exactly "host:port"
            # (IPv6 literals, "user:pass@host:port", ...).
            vhost, port = netloc.split(":")
        else:
            vhost = netloc
            # KeyError for schemes we have no default port for.
            port = _DEFAULT_PORTS[scheme]
    except (ValueError, KeyError, TypeError, AttributeError):
        print("error parsing url")
        # Previously execution fell through here and crashed with a NameError
        # on the unbound vhost/port; an unparsable URL is simply out of scope.
        return str(False)

    if lib.db.is_vhost_in_db(vhost, workspace):
        return str(True), vhost, port, url.rstrip("/"), workspace
    return str(False)


def insert_url_into_db(vhost, port, url, url_status, workspace):
    """Store one path row via ``lib.db.insert_new_path``.

    The row is ``(vhost, port, url, url_status, 0, "", workspace)``.
    NOTE(review): the meaning of the constant ``0`` and ``""`` columns is
    defined by lib.db and is not visible from this module.
    """
    db_path = (vhost, port, url, url_status, 0, "", workspace)
    lib.db.insert_new_path(db_path)


def extract_in_scope_urls_from_task_output(tool_output):
    """Record every reachable, in-scope URL found in *tool_output*.

    Each extracted URL is checked with is_url_in_scope(); in-scope URLs are
    probed with check_if_page_exists() and, unless the probe failed (status
    999), inserted with insert_url_into_db().

    Returns:
        The number of URLs that were inserted.
    """
    valid_url_count = 0
    for candidate in extract_urls(tool_output):
        scope_result = is_url_in_scope(candidate)
        # Out-of-scope URLs come back as the bare string "False". The original
        # unpacked that string into five names, which only worked because
        # "False" happens to be five characters long.
        if not isinstance(scope_result, tuple):
            continue
        is_in_scope, vhost, port, url, workspace = scope_result
        if is_in_scope != "True":
            continue

        url_status = check_if_page_exists(url)
        print(url, url_status)
        if url_status != _REQUEST_FAILED:
            insert_url_into_db(vhost, port, url, url_status, workspace)
            valid_url_count += 1
    return valid_url_count


def check_if_page_exists(url):
    """Send a HEAD request to *url* and return the HTTP status code.

    Returns 999 when the request could not be completed. All
    ``requests.exceptions.RequestException`` subclasses are treated as a
    failure (connection errors, timeouts, invalid URLs, ...), so that a single
    bad URL cannot abort the caller's loop.
    """
    try:
        # NOTE: verify=False disables TLS certificate validation; kept as in
        # the original code.
        response = requests.head(url, timeout=5, verify=False)
    except requests.exceptions.RequestException:
        return _REQUEST_FAILED
    return response.status_code