# Python source code of findlinks

# -*- coding:utf-8 -*-
# Author: Muhe
# Version:2.0
# No longer crawls addresses that look similar to ones already seen.
# XML parsing sometimes raises errors.

from __future__ import division
import lxml
from lxml.html import fromstring
import requests
import re
import mechanize
import operator
import sys
import os
from time import sleep

class SameFileError(Exception):
    """Module-specific exception type; not raised by any code visible here."""
class NoneTypeError(Exception):
    """Error for a value that is unexpectedly ``None`` or empty."""


# NOTE: a ``global`` statement at module level has no effect, and ``formlist``
# is never assigned by any code visible here.
global formlist
# "action + method + param" keys of the form fields Findsubmit has already handled.
reqlist = []
# Not referenced by any code visible here.
feature_hub = []
# Working directory at import time; base directory for the Results output files.
_Root = os.getcwd()

# Fetch a URL and return (HTTP status code, requested URL, decoded page text).
def _requestData(url):
    """Issue a GET request for ``url`` with browser-like headers.

    Args:
        url: Absolute URL to fetch.

    Returns:
        A 3-tuple ``(status, url, text)``. On success ``status`` is the
        integer HTTP status code and ``text`` is the decoded response body.
        If the request raises, the tuple is ``('err ', url, None)``.
    """
    headers = {
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip, deflate',
        # 'Cookie': '',
        'Upgrade-Insecure-Requests': '1'
    }
    try:
        req = requests.get(url, headers=headers, timeout=5)
    except Exception:
        # Best effort: a failed request is reported through the return value.
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit still stop the crawler.
        return 'err ', url, None
    return req.status_code, url, req.text


def getLinks(self):
    """Fetch a page and collect the absolute link targets it contains.

    Args:
        self: URL of the page to fetch (a plain string, despite the name).

    Returns:
        ``(links, page_text)`` on success, where ``links`` is a list of the
        URLs found on ``<a>``, ``<iframe>`` and ``<frame>`` elements, made
        absolute against the requested URL.
        ``(url, None)`` when the page has no data or cannot be parsed;
        callers detect failure by checking the second element for ``None``.
    """
    _status, resHost, resData = _requestData(self)
    if not resData:
        # Nothing to parse (request failed or the body was empty).
        return resHost, None
    try:
        doc = lxml.html.document_fromstring(resData)
        doc.make_links_absolute(resHost)
    except Exception:
        # Parsing is best effort; a broken document is treated like a failed fetch.
        return resHost, None
    tags = ('a', 'iframe', 'frame')
    # iterlinks() yields (element, attribute, link, pos) tuples.
    trueLinks = [
        target
        for element, _attr, target, _pos in doc.iterlinks()
        if element.tag in tags
    ]
    return trueLinks, resData


def correct_url(url):
    """Return ``url`` with surrounding whitespace removed and a scheme ensured.

    ``http://`` is prepended only when the URL does not already *start* with
    ``http://`` or ``https://`` (case-insensitive). A scheme that merely
    appears later in the string, e.g. inside a query parameter, does not count.

    Args:
        url: URL or bare host name, e.g. ``'www.example.com'``.

    Returns:
        The stripped URL, guaranteed to begin with an HTTP(S) scheme.
    """
    url = url.strip()
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    return url


def middle_name(url):
    """Return the first host label of ``url`` after dropping scheme and ``www.``.

    For example ``'http://www.example.com/page'`` yields ``'example'``.

    Prefixes are removed explicitly: ``str.strip('www.')`` would strip any of
    the *characters* ``w`` and ``.`` from both ends rather than the prefix.

    Args:
        url: URL or bare host name.

    Returns:
        The leading run of word characters and hyphens of the tidied URL, or
        ``None`` when the tidied URL does not start with such a character.
    """
    tidy = url.strip()
    lowered = tidy.lower()
    for scheme in ('http://', 'https://'):
        if lowered.startswith(scheme):
            tidy = tidy[len(scheme):]
            break
    if tidy.lower().startswith('www.'):
        tidy = tidy[len('www.'):]
    found = re.match(r'[-\w]+', tidy)
    if found is None:
        return None
    return found.group(0)

def getdiffer(list1, list2):
    """Return the length of the shorter sequence, or ``False`` if too unequal.

    Args:
        list1: First sized sequence.
        list2: Second sized sequence.

    Returns:
        ``False`` when the two lengths differ by more than 5; otherwise the
        smaller of the two lengths (which may be ``0``, so callers must test
        the result with ``is False``).
    """
    size1, size2 = len(list1), len(list2)
    if abs(size1 - size2) > 5:
        return False
    return min(size1, size2)

def str_compare(str1, str2, accuracy=0.80):
    """Decide whether two strings are similar by position-wise comparison.

    Characters are compared index by index over the length of the shorter
    string; the number of matches is divided by ``len(str1)``.

    NOTE(review): because the ratio is taken against ``str1`` only, the
    comparison is not symmetric in its arguments.

    Args:
        str1: Reference string; its length is the denominator of the ratio.
        str2: String compared against ``str1``.
        accuracy: Threshold the match ratio must exceed.

    Returns:
        ``True`` if the match ratio is strictly greater than ``accuracy``.
        ``False`` otherwise, including when the lengths differ by more than 5
        (as reported by ``getdiffer``) or when ``str1`` is empty.
    """
    total = len(str1)
    length = getdiffer(str1, str2)
    # An empty reference string has nothing to match (and would divide by zero).
    if length is False or total == 0:
        return False
    # zip() stops at the shorter string, i.e. after ``length`` positions.
    score = sum(1 for left, right in zip(str1, str2) if left == right)
    return score / float(total) > accuracy


def feature_match(link):
    """Report whether ``link`` is similar to any URL recorded in ``url_old``.

    Similarity is decided by ``str_compare(known_url, link)``.

    Returns:
        ``True`` as soon as one recorded URL compares as similar, else ``False``.
    """
    global url_old
    return any(str_compare(known, link) for known in url_old)


def feature_catch(link):
    """Placeholder: currently ignores ``link`` and does nothing."""
    return None


def feature_filter(link):
    """Return ``True`` when ``link`` should be skipped as already covered."""
    # Check whether the link matches a feature that is already known.
    return True if feature_match(link) else False


def single_url(url):
    """Crawl a single page and record every new, in-scope link found on it.

    A link is skipped when it equals the page URL, is a substring of
    ``url + '/index'``, contains ``'javascript'``, is already in ``url_old``,
    does not contain the site keyword ``middle``, or resembles a known URL
    according to ``feature_filter``. Every remaining link is printed,
    appended to ``Results/<middle>_links.txt`` under ``_Root`` and added to
    both ``url_add`` and ``url_old``.

    Args:
        url: Page to crawl; normalised with ``correct_url`` first.

    Returns:
        None. Returns early when the page could not be fetched or parsed.
    """
    global url_add
    global url_old
    global middle
    url = correct_url(url)
    # Fetch the links on the page together with the page text.
    url_links, data = getLinks(url)
    if data is None:
        return
    for link in url_links:
        sys.stdout.write('!')  # progress marker, one per link examined
        if link == url:
            continue
        if link in url + '/index':
            continue
        if 'javascript' in link:
            continue
        if link in url_old:
            continue
        if middle not in link:
            continue
        if feature_filter(link):
            continue
        try:
            print("\n" + link)
        except Exception:
            # Console output is informational only; presumably this guards
            # against links the terminal cannot encode.
            pass
        # Build the path portably and make sure the directory exists, so the
        # first write does not fail on a fresh checkout or a non-Windows host.
        results_dir = os.path.join(_Root, "Results")
        if not os.path.isdir(results_dir):
            os.makedirs(results_dir)
        results_path = os.path.join(results_dir, middle + "_links.txt")
        with open(results_path, "a") as f:
            print("writing:  " + link)
            f.write(link + "\n")
        # Form discovery is currently disabled: Findsubmit(link)
        url_add.append(link)
        # The link is queued in url_add, so it counts as known from now on.
        url_old.append(link)


def Findsubmit(link):
    """Open ``link`` with mechanize and probe each text field of its forms.

    For every form control whose string form contains ``'TextControl'``, the
    combination ``action + method + name`` is recorded in ``reqlist``; the
    first time a combination is seen, ``testxss`` is called for it.

    Args:
        link: URL of the page whose forms are inspected.

    Returns:
        None. Any exception is printed and otherwise ignored.
    """
    global reqlist
    try:
        br = mechanize.Browser()  # initiating the browser
        # NOTE(review): sets a private mechanize attribute, presumably to
        # force the response to be treated as HTML - confirm.
        br._factory.is_html = True
        br.addheaders = [('User-agent',
                          'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        br.open(str(link), timeout=15)
        for form in br.forms():
            for control in form.controls:
                if 'TextControl' not in str(control):
                    continue
                param = str(control.name)
                reqstr = form.action + form.method + param
                if reqstr not in reqlist:
                    reqlist.append(reqstr)
                    testxss(form.action, form.method, param)
    except Exception as e:
        # Best effort: one broken page must not stop the crawl.
        print(e)


def testxss(action, method, param):
    method = method.lower()
    headers = {'content-type': 'application/json',
               'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
    if method == "get":
        print "=" * 10 + "get start" + "=" * 10
        url = action + "/?" + param + "=test1234"
        print url
        # response = requests.get(url,headers=headers)
        # print response.text
        print "=" * 10 + "get end" + "=" * 10
    if method == "post":
        data = {'{0}'.format(param): "test"}
        print "=" * 10 + "post start" + "=" * 10
        print action
        print data
        # response = requests.post(action,data=data,headers=headers)
        # print response.text
        print "=" * 10 + "post end" + "=" * 10


def findlink(input, level=2):
    """Crawl a site breadth-first starting from ``input``.

    Resets the module-level crawl state (``url_new``, ``url_old``,
    ``url_add``, ``middle``) and then crawls level by level: each page of the
    current level is passed to ``single_url``, which appends newly found
    links to ``url_add``; those links form the next level.

    Args:
        input: Start URL or bare host name.
        level: Maximum number of levels to crawl.

    Returns:
        None. Results are left in the module-level lists and in whatever
        ``single_url`` writes.
    """
    global url_new
    global url_old
    global url_add
    global middle
    # Main entry point.
    url_new = []  # links to crawl at the current level
    url_old = []  # every URL already seen or crawled
    url_add = []  # links discovered while crawling the current level
    middle = middle_name(input)
    url_new.append(input)
    for _level in range(level):
        for current in url_new:
            # Links queued by single_url are already in url_old; avoid duplicates.
            if current not in url_old:
                url_old.append(current)
            sleep(0.5)  # pause between requests
            single_url(current)
        # Hand only the fresh discoveries to the next level and start a new
        # empty batch; reusing the same list would re-crawl every earlier link.
        url_new, url_add = url_add, []
        if not url_new:
            break


if __name__ == '__main__':
    # Command-line entry point: the first argument is the site to crawl.
    try:
        url = sys.argv[1]
    except IndexError:
        print("Usage: python findlinks.py www.example.com")
        # sys.exit() instead of the site-provided exit(); same exit status.
        sys.exit()
    # Crawl up to 10 levels deep from the given start address.
    findlink(url, 10)