# Python source code of search

"""
Contains all functions used for searching Stack Overflow and Google
"""

import os
import random
import re
import sys

from bs4 import BeautifulSoup
import requests
import urwid

import socli.printer
import socli.tui

# Mutable module state shared by the request helpers below.
uas = []  # User agent list, filled by load_user_agents()
header = {}  # Request header, rebuilt by random_headers()
google_search = True  # When true, questions are looked up through Google instead of Stack Overflow's own search

so_url = "http://stackoverflow.com"  # Site URL (prefixed to relative question URLs)
so_qurl = "http://stackoverflow.com/search?q="  # Query URL (the query string is appended as-is)
so_burl = "https://stackoverflow.com/?tab="  # Assumed to be the browse URL; not referenced in this part of the file
google_search_url = "https://www.google.com/search?q=site:www.stackoverflow.com+"  # Google search query URL


def get_questions_for_query(query, count=10):
    """
    Fetch questions for a query using Stack Overflow default search mechanism.
    Returned question urls are relative to SO homepage.
    At most 10 questions are returned. (Can be altered by passing count)
    Prints a warning and exits the program when the result page contains no
    question summaries.
    :param query: User-entered query string (appended to the search URL as-is)
    :param count: Maximum number of questions to return
    :return: list of [ (question_text, question_description, question_url) ]
    """
    random_headers()

    search_res = requests.get(so_qurl + query, headers=header)
    captcha_check(search_res.url)
    soup = BeautifulSoup(search_res.text, 'html.parser')

    # Parse the result page once and reuse the lists below.
    summaries = soup.find_all("div", class_="question-summary")
    if not summaries:
        socli.printer.print_warning("No results found...")
        sys.exit(0)
    excerpts = soup.find_all("div", class_="excerpt")

    questions = []
    for index, summary in enumerate(summaries):
        if index >= count:  # limit results
            break
        question_text = ' '.join(summary.a.get_text().split())
        question_text = question_text.replace("Q: ", "")
        # Excerpts are paired with summaries by position. If the page has
        # fewer excerpts than summaries, fall back to an empty description
        # instead of failing with an IndexError.
        if index < len(excerpts):
            question_desc = excerpts[index].get_text().replace("'\r\n", "")
        else:
            question_desc = ""
        question_desc = ' '.join(question_desc.split())
        question_local_url = summary.a.get("href")
        questions.append((question_text, question_desc, question_local_url))

    return questions


def get_questions_for_query_google(query, count=10):
    """
    Fetch questions for a query using Google search.
    Returned question urls are full URLs, normalised by fix_google_url.
    At most 10 questions are returned. (Can be altered by passing count)
    Prints a warning and exits the program when no usable result is found.
    :param query: User-entered query string (appended to the search URL as-is)
    :param count: Maximum number of questions to return
    :return: list of [ [question_text, question_description, question_url] ]
    """
    questions = []
    random_headers()
    search_results = requests.get(google_search_url + query, headers=header)
    captcha_check(search_results.url)
    soup = BeautifulSoup(search_results.text, 'html.parser')

    results = soup.find_all("div", class_="g")
    if not results:
        socli.printer.print_warning("No results found...")
        sys.exit(0)

    for result in results:
        # The limit is checked against the number of accepted questions, so
        # skipped results can never push the list past ``count``.
        if len(questions) == count:
            break
        try:
            question_title = result.find("div", class_="r").find("h3").get_text().replace(' - Stack Overflow', '')
            question_desc = result.find("span", class_="st").get_text()
            question_url = result.find("a").get("href")  # Retrieves the Stack Overflow link
        except AttributeError:
            # One of the expected elements is missing from this result.
            continue

        if question_desc == "":  # For avoiding instant answers
            continue
        if question_url is None:  # Anchor without a href attribute
            continue

        question_url = fix_google_url(question_url.lower())
        if question_url is None:  # Not a question/answer page
            continue

        questions.append([question_title, question_desc, question_url])

    # Check if there are any valid question posts
    if not questions:
        socli.printer.print_warning("No results found...")
        sys.exit(0)
    return questions


def get_question_stats_and_answer(url):
    """
    Download a StackOverflow question page and pull out its contents.
    :param url: full url of a StackOverflow question
    :return: tuple of ( question_title, question_desc, question_stats, answers )
    """
    random_headers()
    response = requests.get(url, headers=header)
    captcha_check(response.url)
    page = BeautifulSoup(response.text, 'html.parser')

    # Stats are extracted first, before the answer texts are collected.
    question_title, question_desc, question_stats = get_stats(page)

    # The first "post-text" block is the question itself, so skip it.
    posts = page.find_all("div", class_="post-text")
    answers = [post.get_text() for post in posts[1:]]
    if not answers:
        answers.append('No answers for this question ...')

    return question_title, question_desc, question_stats, answers


def get_stats(soup):
    """
    Get Question stats
    :param soup: parsed question page (BeautifulSoup object)
    :return: tuple of ( question_title, question_desc, question_stats ).
             question_stats is a single whitespace-normalised line, or
             "Could not load statistics." when the asked/active/viewed
             information cannot be read from the page.
    """
    question_title = (soup.find_all("a", class_="question-hyperlink")[0].get_text())
    question_stats = (soup.find_all("div", class_="js-vote-count")[0].get_text())
    try:
        # The asked/active/viewed texts are read from the parent of the first
        # <time> element and the next two <div> elements after it.
        time_parent = soup.find("time").parent
        active_div = time_parent.find_next('div')
        asked_info = time_parent.get_text()
        active_info = active_div.get_text()
        viewed_info = active_div.find_next('div').get_text()
        question_stats = "Votes " + question_stats + " | " + asked_info + " | " + active_info + " | " + viewed_info
    except Exception:
        # Best effort: any lookup failure (e.g. a missing element yields None)
        # degrades to a placeholder. KeyboardInterrupt/SystemExit still propagate.
        question_stats = "Could not load statistics."
    question_desc = (soup.find_all("div", class_="post-text")[0])
    add_urls(question_desc)  # Mutates the tag in place before its text is taken
    question_desc = question_desc.get_text()
    question_stats = ' '.join(question_stats.split())
    return question_title, question_desc, question_stats


def add_urls(tags):
    """
    Adds the URL to any hyperlinked text found in a question
    or answer. Every anchor that carries a href attribute has its
    content replaced in place by "<text> [<href>]"; anchors without
    a href are left untouched.
    :param tags: parsed element whose <a> descendants are rewritten
    """
    for link in tags.find_all("a"):
        # hasattr() cannot be used here: attribute access on a BeautifulSoup
        # tag is a child-tag lookup and never fails, so an anchor without a
        # href would raise KeyError on link['href'].
        if link.has_attr("href"):
            link.string = "{} [{}]".format(link.text, link['href'])


def socli_interactive_windows(query):
    """
    Interactive mode basic implementation for windows, since urwid doesn't support CMD.

    Searches Stack Overflow directly (Google is not used here), prints up to
    10 results, asks for an option number, shows the chosen question through
    socli.printer.display_results and then loops on single-letter commands:
    "n" next answer, "b" previous answer, "o" open in browser, anything
    else exits.

    Most paths end the process with sys.exit(0); on a connection error a
    failure message is printed and the function returns.
    :param query: query string appended to the Stack Overflow search URL as-is
    :return: None
    """
    try:
        # NOTE(review): unlike the other fetch helpers in this file, this
        # request is sent without the randomised ``header``.
        search_res = requests.get(so_qurl + query)
        captcha_check(search_res.url)
        soup = BeautifulSoup(search_res.text, 'html.parser')
        try:
            soup.find_all("div", class_="question-summary")[0]  # For explicitly raising exception
            tmp = (soup.find_all("div", class_="question-summary"))
            tmp1 = (soup.find_all("div", class_="excerpt"))
            i = 0
            # Relative URLs of the listed questions, in display order
            question_local_url = []
            print(socli.printer.bold("\nSelect a question below:\n"))
            while i < len(tmp):
                if i == 10: break  # limiting results
                question_text = ' '.join((tmp[i].a.get_text()).split())
                question_text = question_text.replace("Q: ", "")
                # Excerpts are paired with summaries by position
                question_desc = (tmp1[i].get_text()).replace("'\r\n", "")
                question_desc = ' '.join(question_desc.split())
                socli.printer.print_warning(str(i + 1) + ". " + socli.printer.display_str(question_text))
                question_local_url.append(tmp[i].a.get("href"))
                print("  " + socli.printer.display_str(question_desc) + "\n")
                i = i + 1
            try:
                # A non-numeric entry makes int() raise; the handler below
                # reports the error and exits.
                op = int(socli.printer.inputs("\nType the option no to continue or any other key to exit:"))
                while 1:
                    # After the listing loop ``i`` is the number of listed
                    # questions, so valid options are 1..i.
                    if (op > 0) and (op <= i):
                        socli.printer.display_results(so_url + question_local_url[op - 1])
                        cnt = 1  # this is because the 1st post is the question itself
                        while 1:
                            # NOTE(review): ``tmpsoup`` is looked up in this module's
                            # globals but is not assigned anywhere in the visible part
                            # of the file — presumably it is meant to be the page parsed
                            # by socli.printer.display_results; TODO confirm.
                            global tmpsoup
                            qna = socli.printer.inputs(
                                "Type " + socli.printer.bold("o") + " to open in browser, " + socli.printer.bold("n") + " to next answer, " + socli.printer.bold(
                                    "b") + " for previous answer or any other key to exit:")
                            if qna in ["n", "N"]:
                                try:
                                    answer = (tmpsoup.find_all("div", class_="post-text")[cnt + 1].get_text())
                                    socli.printer.print_green("\n\nAnswer:\n")
                                    print("-------\n" + answer + "\n-------\n")
                                    cnt = cnt + 1
                                except IndexError as e:
                                    # Ran past the last "post-text" block
                                    socli.printer.print_warning(" No more answers found for this question. Exiting...")
                                    sys.exit(0)
                                continue
                            elif qna in ["b", "B"]:
                                if cnt == 1:
                                    socli.printer.print_warning(" You cant go further back. You are on the first answer!")
                                    continue
                                answer = (tmpsoup.find_all("div", class_="post-text")[cnt - 1].get_text())
                                socli.printer.print_green("\n\nAnswer:\n")
                                print("-------\n" + answer + "\n-------\n")
                                cnt = cnt - 1
                                continue
                            elif qna in ["o", "O"]:
                                import webbrowser
                                if sys.platform.startswith('darwin'):
                                    browser = webbrowser.get('safari')
                                else:
                                    browser = webbrowser.get()
                                socli.printer.print_warning("Opening in your browser...")
                                browser.open(so_url + question_local_url[op - 1])
                                # No break/continue here: the prompt is shown again
                            else:
                                break
                        sys.exit(0)
                    else:
                        # NOTE(review): this re-prompt uses the builtin input(),
                        # not socli.printer.inputs as above.
                        op = int(input("\n\nWrong option. select the option no to continue:"))
            except Exception as e:
                # SystemExit is not an Exception subclass, so the sys.exit()
                # calls above pass through this handler.
                socli.printer.showerror(e)
                socli.printer.print_warning("\n Exiting...")
                sys.exit(0)
        except IndexError:
            # Raised by the [0] probe above when the page has no results
            socli.printer.print_warning("No results found...")
            sys.exit(0)

    except UnicodeEncodeError:
        socli.printer.print_warning("\n\nEncoding error: Use \"chcp 65001\" command before using socli...")
        sys.exit(0)
    except requests.exceptions.ConnectionError:
        socli.printer.print_fail("Please check your internet connectivity...")
    except Exception as e:
        socli.printer.showerror(e)
        sys.exit(0)


def socli_interactive(query):
    """
    Interactive mode: list the search results in a urwid screen and let the
    user open one of them by pressing its number.
    On win32 the work is delegated to socli_interactive_windows.
    :param query: query string handed to the search helper
    :return:
    """
    if sys.platform == 'win32':
        return socli_interactive_windows(query)

    class SelectQuestionPage(urwid.WidgetWrap):
        """Result list screen; digit keys open the matching question."""

        def __init__(self, questions):
            self.questions = questions
            # One cache slot per digit key (0-9)
            self.cachedQuestions = [None] * 10

            entries = []
            for number, item in enumerate(questions):
                entries.append(self.display_text(number, item))
            self.questions_box = socli.tui.ScrollableTextBox(entries)

            self.header = socli.tui.UnicodeText(('less-important', 'Select a question below:\n'))

            last_index = str(len(self.questions) - 1)
            self.footerText = '0-' + last_index + ': select a question, any other key: exit.'
            self.errorText = socli.tui.UnicodeText.to_unicode(
                'Question numbers range from 0-' + last_index + ". Please select a valid question number.")
            # The footer widget is built from the plain text first; only then
            # is the stored text converted.
            self.footer = socli.tui.UnicodeText(self.footerText)
            self.footerText = socli.tui.UnicodeText.to_unicode(self.footerText)

            body = urwid.Filler(self.questions_box, height=('relative', 100), valign='top')
            layout = urwid.Frame(header=self.header, body=body, footer=self.footer)
            urwid.WidgetWrap.__init__(self, layout)

        def display_text(self, index, question):
            """Build the text markup for one entry of the result list."""
            question_text, question_desc, _ = question
            heading = ("warning", u"{}. {}\n".format(index, question_text))
            return [heading, question_desc + "\n"]

        # Override parent method
        def selectable(self):
            return True

        def keypress(self, size, key):
            if key in '0123456789':
                try:
                    question_url = self.questions[int(key)][2]
                    self.footer.set_text(self.footerText)
                    self.select_question(question_url, int(key))
                except IndexError:
                    # Digit is larger than the last listed question
                    self.footer.set_text(self.errorText)
                return
            if key in {'down', 'up'}:
                self.questions_box.keypress(size, key)
                return
            raise urwid.ExitMainLoop()

        def select_question(self, url, index):
            """Show the question page for ``index``, fetching it on first use."""
            page = self.cachedQuestions[index]
            if page is None:
                # Stack Overflow search yields relative URLs
                if not google_search:
                    url = so_url + url
                question_title, question_desc, question_stats, answers = get_question_stats_and_answer(url)
                page = socli.tui.QuestionPage((answers, question_title, question_desc, question_stats, url))
                self.cachedQuestions[index] = page
            socli.tui.question_post = page
            socli.tui.MAIN_LOOP.widget = page

    socli.tui.display_header = socli.tui.Header()

    try:
        if google_search:
            search = get_questions_for_query_google
        else:
            search = get_questions_for_query
        questions = search(query)
        question_page = SelectQuestionPage(questions)
        socli.tui.MAIN_LOOP = socli.tui.EditedMainLoop(question_page, socli.printer.palette)
        socli.tui.MAIN_LOOP.run()

    except UnicodeEncodeError:
        socli.printer.print_warning("\n\nEncoding error: Use \"chcp 65001\" command before using socli...")
        sys.exit(0)
    except requests.exceptions.ConnectionError:
        socli.printer.print_fail("Please check your internet connectivity...")
    except Exception as e:
        socli.printer.showerror(e)
        print("exiting...")
        sys.exit(0)


def socli_manual_search(query, rn):
    """
    Show the rn-th search result for a query without any interaction.
    :param query: raw query, url-encoded here before searching
    :param rn: 1-based index of the result to display
    :return:
    """
    if rn < 1:
        socli.printer.print_warning(
            "Count starts from 1. Use: \"socli -i 2 -q python for loop\" for the 2nd result for the query")
        sys.exit(0)

    query = socli.printer.urlencode(query)
    # Ask for up to 99 results so indexes above 10 can be selected
    max_results = 99
    try:
        random_headers()
        try:
            if google_search:
                found = get_questions_for_query_google(query, max_results)
                res_url = found[rn - 1][2]
            else:
                found = get_questions_for_query(query, max_results)
                # Stack Overflow search yields relative URLs
                res_url = so_url + found[rn - 1][2]
            socli.printer.display_results(res_url)
        except IndexError:
            socli.printer.print_warning("No results found...")
            sys.exit(1)
    except UnicodeEncodeError:
        socli.printer.print_warning("Encoding error: Use \"chcp 65001\" command before using socli...")
        sys.exit(0)
    except requests.exceptions.ConnectionError:
        socli.printer.print_fail("Please check your internet connectivity...")
    except Exception as e:
        socli.printer.showerror(e)
        sys.exit(0)


def fix_google_url(url):
    """
    Normalise a link scraped from a Google result page into a plain
    Stack Overflow URL.
    :param url: href value taken from the result HTML
    :return: URL usable with requests.get, or None when the link does not
             point at a question ("/questions/<digits>") or answer
             (".com/a/<digit>") page
    """
    query_prefix = "/url?q="
    redirect_prefix = "http://www.google.com/url?url="
    broken_prefix = "https:///url?url="  # Rare leftover once the protocol has been added

    # Cut Google's tracking parameters: keep everything before the first "&"
    if "&sa=" in url:
        url = url.partition("&")[0]

    if url.startswith(query_prefix):
        url = url[len(query_prefix):]

    # Drop the redirect header so only the Stack Overflow link remains
    if url.startswith(redirect_prefix):
        url = url[len(redirect_prefix):]

    # Add the protocol if it doesn't already exist
    if not url.startswith("http"):
        url = "https://" + url

    # Only question and answer pages are accepted
    is_question = re.search(r"/questions/[0-9]+", url) is not None
    is_answer = re.search(r"\.com/a/[0-9]", url) is not None
    if not (is_question or is_answer):
        return None

    if url.startswith(broken_prefix):
        url = url[len(broken_prefix):]

    return url


def captcha_check(url):
    """
    Exits program when there is a captcha. Prevents errors.
    Users will have to manually verify their identity.

    The Google "sorry" page is only looked for while Google search is
    enabled. The Stack Overflow captcha page is looked for in both modes,
    because question pages are fetched from Stack Overflow either way
    (see get_question_stats_and_answer).
    :param url: final URL of the response that was just fetched
    :return: None; the process exits with status 0 when a captcha is detected
    """
    # Check if google detects user as a bot
    if google_search and re.search(r"ipv4\.google\.com/sorry", url):
        google_error_display_msg = "Google thinks you're a bot because you're issuing too many queries too quickly!" + \
                                   " Now you'll have to wait about an hour before you're unblocked... :(. " \
                                   "Use the -s tag to search via Stack Overflow instead."
        socli.printer.print_warning(google_error_display_msg)
        # sys.exit rather than the bare exit() helper, which is only injected
        # by the site module and is not guaranteed to exist.
        sys.exit(0)

    if re.search(r"\.com/nocaptcha", url):  # Searching for stackoverflow captcha check
        socli.printer.print_warning("StackOverflow captcha check triggered. Please wait a few seconds before trying again.")
        sys.exit(0)


def load_user_agents():
    """
    Loads the list of user agents from user_agents.txt (located next to this
    module) into the global ``uas`` list and shuffles it.
    The file is read in binary mode, so the entries are bytes objects.
    Blank and whitespace-only lines are skipped.
    :return:
    """
    global uas
    uas = []
    with open(os.path.join(os.path.dirname(__file__), "user_agents.txt"), 'rb') as uaf:
        for line in uaf:
            # Strip before testing: a raw line still carries its newline and
            # is therefore never empty.
            ua = line.rstrip()
            if ua:
                uas.append(ua)
    random.shuffle(uas)


def random_headers():
    """
    Sets header variable to a random value: the global ``header`` becomes a
    dict holding one randomly chosen entry of ``uas`` under "User-Agent".
    If ``uas`` is still empty, the user agent list is loaded first.
    :return:
    """
    global uas
    global header
    if not uas:
        # random.choice raises IndexError on an empty sequence, so load the
        # list lazily when load_user_agents() has not been called yet.
        load_user_agents()
    ua = random.choice(uas)
    header = {"User-Agent": ua}