# Python source code of __init__

"""Handler for Straits Times."""

import datetime
import re
import time
from difflib import SequenceMatcher
import dateutil.parser
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from nltk.util import ngrams
from comment import Comment
from handlers.abstract_base_handler import AbstractBaseHandler, HandlerError


class STHandler(AbstractBaseHandler):
    """Handler for Straits Times.

    Every method is a classmethod and the working state (``url``, ``soup``,
    ``title``) is stored on the class itself, so each call to ``handle``
    overwrites the state left behind by the previous call.
    """

    # parsed HTML of the submitted URL; assigned by make_soup()/handle()
    soup = None
    # URL currently being handled; assigned by handle()
    url = None
    # og:title of the submitted page; assigned by handle_premium()
    title = None
    # number of consecutive daily index pages handle_premium() searches (offsets 0 .. MAX_DAYS_OFFSET - 1)
    MAX_DAYS_OFFSET = 2
    # upper bound on candidate articles get_matching_article() fetches per call
    MAX_CURLS_ALLOWED = 5
    # article_bodies_match() reports a match only when its score exceeds this value
    MIN_PASSING_SCORE = 0.5
    # seconds passed to time.sleep() after each candidate that did not match
    SLEEP_BETWEEN_CURLS = 1
    # get_date() moves to the next day when the published hour is greater than this
    ST_PUBLISH_CUTOFF_HOUR = 5
    # similar() returns this value in place of a perfect score of 1
    MODERATED_MAX = 0.8  # we don't want perfect scores overwhelming

    @classmethod
    def handle(cls, url):
        """Fetch ``url`` and dispatch to the premium or non-premium handler.

        The URL and its parsed HTML are stored on the class before dispatching;
        the result of the selected handler is returned unchanged.
        """
        cls.url = url
        cls.soup = cls.make_soup()
        if cls.is_premium_article():
            return cls.handle_premium()
        return cls.handle_non_premium()

    @classmethod
    def make_soup(cls):
        """Download ``cls.url``, parse it with html.parser, store the soup on the class and return it."""
        response = requests.get(cls.url)
        cls.soup = BeautifulSoup(response.text, "html.parser")
        return cls.soup

    @classmethod
    def is_premium_article(cls):
        """Check if an article is premium."""
        elem = cls.soup.find(name="div", class_="paid-premium st-flag-1")
        return elem is not None

    @classmethod
    def handle_non_premium(cls):
        """Download and parse ``cls.url`` and wrap its title and text in a Comment."""
        parsed = Article(cls.url)
        parsed.download()
        parsed.parse()
        return Comment(parsed.title, parsed.text)

    @classmethod
    def handle_premium(cls):
        """Look for a scraped copy of a premium article and return it, or None if none matches."""
        title_tag = cls.soup.find(name="meta", property="og:title")
        cls.title = title_tag['content']
        print(f"article title: {cls.title}")

        # The same article may appear in the index of a later day, so try
        # each day offset in turn and stop at the first one that yields a match.
        for days_offset in range(cls.MAX_DAYS_OFFSET):
            candidates = cls.generate_today_articles(days_offset)
            match = cls.get_matching_article(candidates)
            if match is None:
                continue
            return match

        print(f"unable to find a suitable article that matches {cls.title}, skipping submission")
        return None

    @classmethod
    def generate_today_articles(cls, days_offset):
        """Score every link on one day's index page against ``cls.title``.

        Args:
            days_offset: forwarded to ``get_articles_index`` to pick the index page.

        Returns:
            A list of ``(anchor_tag, score)`` tuples sorted by descending score,
            so the first entry is the most likely match for the title.
        """
        index_page = BeautifulSoup(cls.get_articles_index(days_offset), "html.parser")
        # find_all is the supported method name; findAll is a deprecated alias
        scored_articles = [
            (anchor, cls.similar(anchor.text, cls.title))
            for anchor in index_page.find_all(name="a")
        ]
        # stable sort, best score first
        scored_articles.sort(key=lambda pair: pair[1], reverse=True)
        return scored_articles

    @classmethod
    def get_matching_article(cls, scored_articles):
        """Return the first candidate whose body contains the submitted article's preview.

        Candidates are tried in the order given (most likely first). Because of
        rate limits at most ``MAX_CURLS_ALLOWED`` candidates are downloaded, with
        a pause of ``SLEEP_BETWEEN_CURLS`` seconds after each one that does not match.

        Args:
            scored_articles: sequence of ``(anchor_tag, score)`` entries, best
                candidate first. The sequence is not modified.

        Returns:
            The Comment built from the first matching candidate, or None when no
            checked candidate matches.
        """
        if not scored_articles or cls.MAX_CURLS_ALLOWED <= 0:
            return None

        # The preview of the submitted article is the same for every candidate,
        # so download it once instead of once per candidate.
        preview_body = cls.handle_non_premium().body

        articles_checked_so_far = 0
        for entry in scored_articles:
            if articles_checked_so_far >= cls.MAX_CURLS_ALLOWED:
                break

            href = entry[0].get('href')
            if href is None:
                # an anchor without a link target cannot be fetched; it does not use up a curl
                continue

            curr_comment = cls.make_comment(href)
            if cls.article_bodies_match(preview_body, curr_comment.body):
                return curr_comment

            articles_checked_so_far += 1
            time.sleep(cls.SLEEP_BETWEEN_CURLS)

        return None

    @classmethod
    def article_bodies_match(cls, preview_body, article_body):
        """Return True when ``preview_body`` looks like a subset of ``article_body``.

        Each sentence of the preview is fuzzily searched for in the article body.
        Its confidence is weighted by the sentence's share of the preview length,
        so longer sentences count for more. The weighted total must exceed
        ``MIN_PASSING_SCORE``.
        """
        preview_length = float(len(preview_body))
        confidence = 0
        for sentence in cls.split_into_sentences(preview_body):
            found = cls.is_needle_in_hay(needle=sentence, hay=article_body)
            confidence += found * (len(sentence) / preview_length)
        return confidence > cls.MIN_PASSING_SCORE

    @classmethod
    def make_comment(cls, best_candidate):
        """Download a candidate article from pressreader and wrap it in a Comment.

        Args:
            best_candidate: path that is appended to ``https://www.pressreader.com``.

        Returns:
            A Comment holding the cleaned title and body, or an empty Comment
            (``Comment('', '')``) when the downloaded page cannot be parsed.
        """
        url = f"https://www.pressreader.com{best_candidate}"
        article = Article(url, browser_user_agent="Googlebot-News", keep_article_html=True)
        article.download()
        try:
            article.parse()
        except Exception as error:
            # Best effort: a candidate that cannot be parsed is treated as empty.
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            print(f"unable to parse the article in this url: {url} ({error})")
            return Comment('', '')

        # strip soft hyphens (U+00AD) from the extracted text
        title = article.title.replace("\xad", "")
        body = article.text.replace("\xad", "")

        print(f"checking the article in this url: {url} with title {title}")
        return Comment(title, body)

    @classmethod
    def get_articles_index(cls, days_offset):
        """Return the HTML of the pressreader index page for the date chosen by ``days_offset``."""
        published_date = cls.get_date(days_offset)
        url = f"https://www.pressreader.com/singapore/the-straits-times/{published_date}"
        response = requests.get(url, headers={"User-Agent": "Googlebot-News"})
        # drop soft-hyphen entities so the link texts are clean
        return response.text.replace("&#173;", "")

    @classmethod
    def get_date(cls, days_offset):
        """Return the index date (``YYYYMMDD``) for the article's publish time plus ``days_offset`` days."""
        published_meta = cls.soup.find(name="meta", property="article:published_time")
        shifted = dateutil.parser.parse(published_meta['content']) + datetime.timedelta(days=days_offset)
        if shifted.hour <= cls.ST_PUBLISH_CUTOFF_HOUR:
            return shifted.strftime('%Y%m%d')
        # published after the cutoff hour: the article only shows up in the next day's index
        next_day = shifted + datetime.timedelta(days=1)
        return next_day.strftime('%Y%m%d')

    @classmethod
    def similar(cls, candidate, title):
        """Score how similar a candidate headline is to ``title`` using fuzzy matching.

        The candidate is lower-cased and split into words. Words on the blacklist
        (articles, pronouns, prepositions, conjunctions and a few very common
        words) are ignored. Each remaining word is fuzzily searched for in the
        title, and only words whose raw similarity is at least 0.75 contribute.

        Args:
            candidate: headline text of a candidate article.
            title: title of the article being looked for.

        Returns:
            A score with ``0 <= score <= 1``; higher means more similar. A perfect
            score of 1 is replaced by ``MODERATED_MAX``.
        """
        title = title.lower()
        candidate = candidate.lower()
        articles = ["a", "an", "the"]
        pronouns = ["all", "another", "any", "anybody", "anyone", "anything", "as", "aught", "both", "each", "each",
                    "other", "either", "enough", "everybody", "everyone", "everything", "few", "he", "her", "hers",
                    "herself", "him", "himself", "his", "idem", "it", "its", "itself", "many", "me", "mine", "most",
                    "my", "myself", "naught", "neither", "no", "one", "nobody", "none", "nothing", "nought", "one",
                    "one", "another", "other", "others", "ought", "our", "ours", "ourself", "ourselves", "several",
                    "she", "some", "somebody", "someone", "something", "somewhat", "such", "suchlike", "that", "thee",
                    "their", "theirs", "theirself", "theirselves", "them", "themself", "themselves", "there", "these",
                    "they", "thine", "this", "those", "thou", "thy", "thyself", "us", "we", "what", "whatever",
                    "whatnot", "whatsoever", "whence", "where", "whereby", "wherefrom", "wherein", "whereinto",
                    "whereof", "whereon", "wherever", "wheresoever", "whereto", "whereunto", "wherewith", "wherewithal",
                    "whether", "which", "whichever", "whichsoever", "who", "whoever", "whom", "whomever", "whomso",
                    "whomsoever", "whose", "whosever", "whosesoever", "whoso", "whosoever", "ye", "yon", "yonder",
                    "you", "your", "yours", "yourself", "yourselves"]
        prepositions = ["of", "with", "at", "from", "into", "during", "including", "until", "against", "among",
                        "throughout", "despite", "towards", "upon", "concerning", "to", "in", "for", "on", "by",
                        "about", "like", "through", "over", "before", "between", "after", "since", "without", "under",
                        "within", "along", "following", "across", "behind", "beyond", "plus", "except", "but", "up",
                        "out", "around", "down", "off", "above", "near"]
        # NOTE(review): multi-word conjunctions appear here as single words, so words such as
        # "time", "order" and "case" are blacklisted too - confirm this is intended
        conjunctions = ["for", "and", "nor", "but", "or", "yet", "so", "after", "although", "as", "as", "if", "as",
                        "long", "as", "as", "much", "as", "as", "soon", "as", "as", "though", "because", "before", "by",
                        "the", "time", "even", "if", "even", "though", "if", "in", "order", "that", "in", "case",
                        "lest", "once", "only", "if", "provided", "that", "since", "so", "that", "than", "that",
                        "though", "till", "unless", "until", "when", "whenever", "where", "wherever", "while", "both",
                        "and", "either", "or", "neither", "nor", "not", "only", "but", "also", "whether", "or"]
        redherrings = ["singapore", "singaporeans", "s'pore", "says", "is", "has", "are", "am", "were", "been", "have",
                       "had", "having"]
        blacklist = frozenset(articles + pronouns + prepositions + conjunctions + redherrings)
        score = 0
        words_scored = 0

        # raw string so the character class reaches the regex engine exactly as written
        for word in re.split(r"[ '.:;,!&\"]", candidate):
            # consecutive delimiters produce empty tokens, which can never match anything
            if not word or word in blacklist:
                continue
            curr_score = cls.is_needle_in_hay(needle=word, hay=title)
            curr_score = (curr_score - 0.5) * 2  # maps a raw similarity of 0.5-1 onto 0-1
            if curr_score < 0.5:
                continue  # too weak a match to count
            words_scored = words_scored + 1
            score = score + curr_score

        final_score = (score / words_scored) if words_scored > 0 else 0
        return cls.MODERATED_MAX if final_score == 1 else final_score

    # https://stackoverflow.com/a/31433394
    @classmethod
    def is_needle_in_hay(cls, needle, hay):
        """Fuzzily search for ``needle`` in ``hay`` and return the confidence that it was found.

        ``hay`` is scanned with a sliding window of words that is 20% longer
        (rounded down) than the needle's word count. Every window is compared
        with the needle using ``SequenceMatcher.ratio()``.

        Args:
            needle: text to look for.
            hay: text to search in.

        Returns:
            The best similarity ratio over all windows, between 0 and 1. The
            result is 0 when the needle has no words or ``hay`` yields no window.
        """
        needle_length = len(needle.split())
        if needle_length == 0:
            # nothing to look for; also avoids requesting zero-width n-grams
            return 0

        window_size = needle_length + int(.2 * needle_length)

        # SequenceMatcher caches its analysis of the second sequence, so the needle
        # is set once and only the first sequence is swapped for each window.
        matcher = SequenceMatcher(None, "", needle)
        max_sim_val = 0
        for ngram in ngrams(hay.split(), window_size):
            matcher.set_seq1(u" ".join(ngram))
            similarity = matcher.ratio()
            if similarity > max_sim_val:
                max_sim_val = similarity

        return max_sim_val  # how confident are we that needle was found in hay

    # https://stackoverflow.com/a/31505798
    # given a string paragraph, return a list of sentences
    @classmethod
    def split_into_sentences(cls, text):
        alphabets = "([A-Za-z])"
        prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        text = " " + text + "  "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text: text = text.replace(".”", "”.")
        if "\"" in text: text = text.replace(".\"", "\".")
        if "!" in text: text = text.replace("!\"", "\"!")
        if "?" in text: text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences