import requests
from bs4 import BeautifulSoup
import validators
import time
from scrapeasy.WebData import OnlineData


# Abstract page class with base functionality
class abstractPage(object):
    def __init__(self, url, verify=True):
        # Define verify behaviour and extract the domain from the url
        self._verify = verify
        url = url.replace("%2F", "/")
        self._domain = self.findDomain(url)

        # Normalize the URL so that it contains nothing before the domain / subdomain
        try:
            self._url = url[url.index(self._domain):]
        except ValueError:
            self._url = url
        if not validators.url("http://" + self._url):
            raise ValueError("Not a valid URL: " + url + "!")

        # Try fetching the response headers via an HTTP HEAD request
        try:
            self._header = requests.head("http://www." + self._url, verify=self._verify).headers
        except requests.exceptions.ConnectionError:
            self._header = "Unknown"

        # Browser-like User-Agent so that websites serve the scraper like a regular client
        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
        self._html = None
        self.update()

        # Categorize links into intern, extern and domain
        self._links = {"intern": [], "extern": [], "domain": []}
        self.findLinks()

        # Empty dict into which media will be inserted
        self._media = {}

    def __str__(self):
        return self._url

    # Getters for private page content
    def getURL(self):
        return self._url

    def getHTML(self):
        return self._html

    def getDomain(self):
        return self._domain

    def getHeader(self):
        return self._header

    # Return links according to the type parameters
    def getLinks(self, intern=True, extern=True, domain=False):
        linklist = []
        if intern:
            linklist += self._links["intern"]
        if extern:
            linklist += self._links["extern"]
        if domain:
            linklist += self._links["domain"]
        return linklist

    # Extract the domain from a URL by cutting at the first slash, which marks the start of the path
    @staticmethod
    def findDomain(url):
        url = url.replace("https://", "")
        url = url.replace("http://", "")
        url = url.replace("www.", "")
        if "/" in url:
            url = url[:url.index("/")]
        return url.lower()

    # The folder is the URL without the file part, i.e. everything before the last slash
    @staticmethod
    def findFolder(url):
        return url[:url.rindex("/")]

    # Strip protocol and www. prefix from a URL
    @staticmethod
    def normalize(string):
        return string.replace("https://", "").replace("http://", "").replace("www.", "")
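    # Illustrative behaviour of the static helpers above (example.com is a placeholder URL):
    #   abstractPage.findDomain("https://www.example.com/a/b.html")  -> "example.com"
    #   abstractPage.findFolder("example.com/a/b.html")              -> "example.com/a"
    #   abstractPage.normalize("https://www.example.com/a")          -> "example.com/a"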
    # Try scraping the site. If it does not work out, wait some time and try again
    def update(self, tries=5):
        try:
            self._html = requests.get("http://www." + self._url, headers=self._headers,
                                      allow_redirects=True, verify=self._verify).text
        except requests.exceptions.ConnectionError:
            if tries > 0:
                time.sleep(1)
                self.update(tries=tries - 1)

    # Extract links from the page, unless the URL points to a well-known file type that
    # certainly contains no HTML (.txt or .md could, in theory, contain links, so they are scanned)
    def findLinks(self):
        # print("Finding links of " + self._url)
        # File types that are to be ignored
        endings = [".jpg", ".jpeg", ".png", ".tiff", ".gif", ".pdf", ".svg", ".ics",
                   ".docx", ".doc", ".mp4", ".mov", ".webm", ".zip", ".ogg"]
        for end in endings:
            if self._url.lower().endswith(end):
                return
        # Nothing to parse if all retries in update() failed
        if self._html is None:
            return

        # Parse the response as lxml and extract all a-tags
        soup = BeautifulSoup(self._html, "lxml")
        for link in soup.find_all("a"):
            # Filter out the href attribute
            link = str(link.get("href")).replace("../", "")

            # Skip links that are None or consist of JavaScript that cannot be read out
            if link == "None" or "javascript:" in link.lower():
                continue

            # Categorize the link according to its form
            if validators.url(link) and "mailto:" not in link:
                if self._domain in link.lower():
                    self.addInternal(self._domain + link[link.lower().index(self._domain) + len(self._domain):])
                else:
                    self.addExternal(Page.normalize(link))
            elif validators.url("http://www." + self._domain + "/" + link) and "mailto:" not in link:
                self.addInternal(self._domain + "/" + link)

    # Add a link to the given list, removing everything before the domain first
    def add(self, linklist, link):
        link = Page.normalize(link)
        if link.endswith("/"):
            link = link[:-1]
        if "#" in link:
            link = link[:link.index("#")]
        if link and link not in linklist:
            linklist.append(link)

    # Add a link to the internal links
    def addInternal(self, link):
        self.add(self._links["intern"], link)

    # Add a link to the external links and its domain to the domain links
    def addExternal(self, link):
        self.add(self._links["extern"], link)
        self.add(self._links["domain"], self.findDomain(link))

    # Filter all internal and external links down to certain file endings and return them
    def filterFiles(self, endlist):
        endings = []
        for ending in endlist:
            ending = ending.lower()
            if not ending.startswith("."):
                ending = "." + ending
            endings.append(ending)
        links = []
        for link in self._links["intern"] + self._links["extern"]:
            for ending in endings:
                if link.lower().endswith(ending):
                    links.append(link)
        return links


# PageMedia extends the abstract page with media scraping support
class PageMedia(abstractPage):
    def __init__(self, url, verify=True):
        abstractPage.__init__(self, url, verify)

    # Find all images in a page by filtering its links and collecting img src attributes
    def updateImages(self):
        data = ["jpg", "jpeg", "png", "tiff", "svg", "webm", "gif", "ico"]
        links = self.findSrc("img")
        new = self.filterFiles(data)
        for link in new:
            if link not in links:
                links.append(link)
        self._media["img"] = links

    # Find all videos in a page by filtering its links and collecting video source src attributes
    def updateVideos(self):
        data = ["avi", "mp4", "mpeg4", "asf", "mov", "qt", "flv", "swf", "wmv"]
        links = self.findSrc("video", "source")
        new = self.filterFiles(data)
        for link in new:
            if link not in links:
                links.append(link)
        self._media["video"] = links

    # Return the list of all image links
    def getImages(self):
        if "img" not in self._media or self._media["img"] is None:
            self.updateImages()
        return self._media["img"]

    # Return the list of all video links
    def getVideos(self):
        if "video" not in self._media or self._media["video"] is None:
            self.updateVideos()
        return self._media["video"]
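    # Illustrative usage of the media getters (hypothetical URL; actual results depend on the live HTML):
    #   page = Page("example.com/gallery")
    #   page.getImages()  # e.g. ["example.com/img/logo.png", ...]
    #   page.getVideos()  # e.g. ["example.com/media/intro.mp4", ...]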
    # Filter all links for a specific file type and return the resulting list
    def get(self, filetype):
        self._media[filetype] = self.filterFiles([filetype])
        return self._media[filetype]

    # Download all files of the given type to the specified folder
    def download(self, filetype, folder):
        if filetype not in self._media:
            self.get(filetype)
        for link in self._media[filetype]:
            data = OnlineData(link)
            data.download(folder)

    # Find src attributes nested inside the given tag sequence,
    # e.g. findSrc("video", "source") reads the src of <source> tags inside <video> tags
    def findSrc(self, *tags):
        links = []
        # Nothing to parse if all retries in update() failed
        if self._html is None:
            return []
        # Sometimes a strange NotImplementedError occurs while parsing
        try:
            soup = BeautifulSoup(self._html, "html.parser")
        except NotImplementedError as nie:
            print("Not implemented error occurred!")
            print(nie.args)
            return []

        # Narrow the result set tag by tag, in the given order
        filtered = soup.find_all(tags[0])
        for tag in tags[1:]:
            filtered_new = []
            for element in filtered:
                filtered_new += element.find_all(tag)
            filtered = filtered_new

        # Read the src attribute of each tag and add the link according to its structure
        for link in filtered:
            img_url = str(link.get("src")).lower()
            if img_url == "none":
                continue
            if self._domain not in img_url:
                if img_url.startswith("/"):
                    self.add(links, self.findFolder(self._url) + img_url)
                elif validators.url(img_url):
                    self.add(links, img_url)
                else:
                    self.add(links, self.findFolder(self._url) + "/" + img_url)
            else:
                self.add(links, img_url)
        return links


# Page is the version of PageMedia that always includes all functionality;
# multiple inheritance may be used here later on
class Page(PageMedia):
    def __init__(self, url, verify=True):
        PageMedia.__init__(self, url, verify=verify)


# Testing
if __name__ == "__main__":
    web = Page("http://mathcourses.ch/mat182.html")
    print(web)
    # web.download("pdf", "mathcourses/pdf-files")
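    # Further illustrative calls (commented out, since they hit the live site):
    # print(web.getLinks(intern=True, extern=False))  # internal links only
    # print(web.get("pdf"))                           # all linked .pdf files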