# Python source code of text

import re

import numpy as np
from nltk import Counter, WordNetLemmatizer

from docqa.configurable import Configurable

"""
Classic/shallow text features. I have only run shallow experiments with these
and have not found them to be of much use.
"""


# All patterns are raw strings so that regex escapes such as \d and \+ are not
# treated as (invalid) Python string escapes.

# Loose check: the token contains at least one digit somewhere.
any_num_regex = re.compile(r"^.*[\d].*$")

# Suffixes allowed directly after a plain integer ("1st", "1990s").
# NOTE: despite the "prefixes" name, these are matched *after* the digits.
int_prefixes = "s|st|th|nd|rd"

# Suffixes allowed after an integer or a decimal ("5km", "3.5million").
all_prefixes = r"km|m|v|K|b|bn|billion|k|million|th\+"

# Strict numeric token:
#   optional leading "+",
#   digits (optionally comma-grouped in threes) or nothing when a "." follows,
#   then either an optional fraction plus optional `all_prefixes` suffix (group
#   "p1") or an optional `int_prefixes` suffix (group "p2"),
#   optional trailing "+".
careful_num_regex = re.compile(
    r"^\+?"
    r"(\d{1,3}(,\d{3})*|\d+|(?=\.))"
    r"(?:(\.\d+)?(?P<p1>%s)?|(?P<p2>%s)?)\+?$" % (all_prefixes, int_prefixes))


def is_number(token):
    """Classify `token` against `careful_num_regex`.

    Returns:
        None if the whole token does not match the strict number pattern;
        otherwise the suffix captured by the "p1" group, else the one captured
        by the "p2" group, else the empty string when the number has no suffix.
    """
    found = careful_num_regex.fullmatch(token)
    if found is None:
        return None
    # "p1" takes precedence over "p2", mirroring the order of the regex branches
    for group_name in ("p1", "p2"):
        suffix = found.group(group_name)
        if suffix is not None:
            return suffix
    return ""


class QaTextFeautrizer(Configurable):
    """Interface for objects that build per-word feature arrays for a question/context pair.

    Every method below raises `NotImplementedError`; subclasses provide the behavior.
    """

    def n_context_features(self):
        """Feature dimension of the context array produced by `get_features`."""
        raise NotImplementedError

    def n_question_features(self):
        """Feature dimension of the question array produced by `get_features`."""
        raise NotImplementedError

    def get_features(self, question, context):
        """
        Build the features for one question/context pair.

        return arrays of shape (n_question_words, feature_dim) (n_context_words, feature_dim)
        """
        raise NotImplementedError


class BasicWordFeatures(QaTextFeautrizer):
    """Surface-form features computed independently for every token.

    Each token is mapped to a float vector with one entry per name in
    `features_names`:

    * Num       - the token is a strict number according to `is_number`
    * NumPrefix - the token is a strict number carrying a non-empty suffix
    * NumExp    - the token consists only of digits, "+", "x", "-", "/",
                  backslash, "=", en dash, ",", ":" and non-word characters,
                  but is neither a strict number nor pure punctuation
    * AnyNum    - the token contains a digit and is not pure punctuation
    * Punct     - the token consists only of non-word characters
    * Cap       - first character upper case, remainder lower case, and not NonEng
                  (a single upper-case letter does not count, since the empty
                  remainder is not lower case)
    * Upper     - `str.isupper()` holds and not NonEng
    * Alpha     - the token consists only of the lower-case letters a-z
    * NonEng    - every character is a word character other than an ASCII
                  letter or digit (this includes "_")
    * Len       - natural log of the token length

    Vectors are memoized per token in `_feature_cache`.
    """
    features_names = ["Num", "NumPrefix", "NumExp", "AnyNum", "Punct",
                      "Cap", "Upper", "Alpha", "NonEng", "Len"]

    def __init__(self):
        # Raw strings: keeps regex escapes out of Python's own string-escape
        # processing (non-raw "\d" / "\W" are invalid string escapes).
        # The en dash is spelled as the regex escape \u2013.
        self.any_num_regex = re.compile(r"^.*\d.*$")
        self.num_exp = re.compile(r"^[\d+x\-/\\=\u2013,:\W]*$")
        self.punc_regex = re.compile(r"^\W+$")
        self.alpha = re.compile(r"^[a-z]+$")
        # Not used by this class itself; kept because it is a public attribute.
        self.any_non_english = re.compile(r".*[^a-zA-Z0-9\W].*")
        self.non_english = re.compile(r"^[^a-zA-Z0-9\W]+$")
        self._feature_cache = {}

    def get_word_features(self, word):
        """Return the (cached) feature vector of shape (n_features,) for `word`.

        An empty token gets an all-zero vector: it has no first character to
        inspect and its log-length is undefined.
        """
        cached = self._feature_cache.get(word)
        if cached is not None:
            return cached

        if not word:
            features = np.zeros(self.n_features)
        else:
            num_prefix = is_number(word)
            non_eng = self.non_english.match(word) is not None
            punc = self.punc_regex.match(word) is not None
            # Booleans are promoted to 0.0/1.0 because the last entry is a float
            features = np.array([
                num_prefix is not None,
                num_prefix is not None and num_prefix != "",
                self.num_exp.match(word) is not None and num_prefix is None and not punc,
                self.any_num_regex.match(word) is not None and not punc,
                punc,
                word[0].isupper() and word[1:].islower() and not non_eng,
                word.isupper() and not non_eng,
                self.alpha.match(word) is not None,
                non_eng,
                np.log(len(word))
            ])
        self._feature_cache[word] = features
        return features

    @property
    def n_features(self):
        """Number of features per token (one per entry of `features_names`)."""
        return 10

    def n_context_features(self):
        return self.n_features

    def n_question_features(self):
        return self.n_features

    def get_sentence_features(self, sent):
        """Return an array of shape (len(sent), n_features), one row per token of `sent`."""
        features = np.zeros((len(sent), self.n_features))
        for i, word in enumerate(sent):
            features[i, :self.n_features] = self.get_word_features(word)
        return features

    def get_features(self, question, context):
        """Return the (question_features, context_features) pair; both sides use the same features."""
        return self.get_sentence_features(question), self.get_sentence_features(context)


def extract_year(token, max_year=2017):
    """Interpret `token` as a year or a decade and return it as an int.

    Recognized forms:

    * a two-digit decade followed by "s" ("80s" -> 1980); the value must be a
      non-negative multiple of ten below 100 and is placed in the 1900s
    * an integer between 1000 and `max_year` inclusive, with an optional
      trailing "s" ("1990" -> 1990, "1990s" -> 1990)

    Args:
        token: the string to interpret; may be empty.
        max_year: largest value accepted as a four-digit year.

    Returns:
        The year as an int, or None when the token is not recognized
        (including tokens that `int()` cannot parse).
    """
    # endswith() is safe on an empty token, unlike indexing token[-1]
    ends_with_s = token.endswith("s")
    if ends_with_s:
        token = token[:-1]

    try:
        val = int(token)
    except ValueError:
        return None

    # The lower bound keeps negative inputs such as "-20s" from turning into 1880
    if ends_with_s and 0 <= val < 100 and val % 10 == 0:
        return 1900 + val
    if 1000 <= val <= max_year:
        return val
    return None


class MatchWordFeatures(QaTextFeautrizer):
    """Marks context words that also occur in the question.

    Each context word gets three binary features. The code assigns them as
    follows (first matching rule wins):

    * the word occurs verbatim in the question words   -> columns 0, 1 and 2 set
    * its lower-cased form occurs in the question      -> columns 0 and 1 set
    * its lemma occurs among the question lemmas       -> column 2 set

    NOTE(review): the meaning of the individual columns is not documented in
    this file and the layout above looks inconsistent between the lower-case
    and lemma cases; it is left unchanged here - confirm the intended layout
    before altering it.

    Question words never receive real features: the question array is either
    all zeros with 3 columns (`empty_question_features=True`) or has 0 columns.
    """

    def __init__(self, require_unique_match, lemmatizer="word_net",
                 empty_question_features=False, stop_words=None):
        """
        Args:
            require_unique_match: if true, only question words (and lower-cased
                forms / lemmas) that occur exactly once are eligible for matching.
            lemmatizer: name of the lemmatizer; only "word_net" is supported.
            empty_question_features: whether the question array has 3 zero
                columns instead of 0 columns.
            stop_words: optional object whose `words` attribute is used as a
                container of lower-cased words to exclude from verbatim matching.

        Raises:
            ValueError: if `lemmatizer` is not "word_net".
        """
        self.lemmatizer = lemmatizer
        self.stop_words = stop_words
        self.empty_question_features = empty_question_features
        if lemmatizer == "word_net":
            self._lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError("Unsupported lemmatizer %r; only 'word_net' is available" % (lemmatizer,))
        self._cache = {}  # word -> lemma memo, see `lemmatize_word`
        self.require_unique_match = require_unique_match

    def n_context_features(self):
        return 3

    def n_question_features(self):
        return 3 if self.empty_question_features else 0

    def lemmatize_word(self, word):
        """Return the lemma of `word`, memoized in `self._cache`."""
        cur = self._cache.get(word)
        if cur is None:
            cur = self._lemmatizer.lemmatize(word)
            self._cache[word] = cur
        return cur

    def get_features(self, question, context):
        """Return (question_features, context_features) for one question/context pair.

        `context_features` has shape (len(context), 3); `question_features` is
        an all-zero array of shape (len(question), n_question_features()).
        """
        stop = set() if self.stop_words is None else self.stop_words.words
        context_features = np.zeros((len(context), 3))

        if not self.require_unique_match:
            # NOTE(review): stop words are only removed from the verbatim set;
            # the lower-cased and lemma sets below still contain them.
            question_words = {x for x in question if x.lower() not in stop}
            question_words_lower = {x.lower() for x in question}
            question_words_stem = {self.lemmatize_word(x) for x in question_words_lower}
        else:
            # Each level keeps only entries that are unique at that level and is
            # derived from the survivors of the previous level. Stop words are
            # not consulted in this mode.
            question_words = {k for k, v in Counter(question).items() if v == 1}
            question_words_lower = {
                k for k, v in Counter(x.lower() for x in question_words).items() if v == 1}
            question_words_stem = {
                k for k, v in Counter(self.lemmatize_word(x)
                                      for x in question_words_lower).items() if v == 1}

        for i, word in enumerate(context):
            if word in question_words:
                context_features[i][:3] = 1
            elif word.lower() in question_words_lower:
                context_features[i][:2] = 1
            # Go through the memoizing helper rather than calling the
            # lemmatizer directly: same lemma, but repeated words are cached.
            elif self.lemmatize_word(word) in question_words_stem:
                context_features[i][2] = 1

        question_features = np.zeros((len(question), self.n_question_features()))
        return question_features, context_features

    def __setstate__(self, state):
        # Re-running __init__ rebuilds the lemmatizer and an empty cache, which
        # __getstate__ strips from the state.
        self.__init__(**state)

    def __getstate__(self):
        # What remains after dropping the private members are exactly the
        # constructor arguments, as __setstate__ expects.
        state = dict(self.__dict__)
        del state["_cache"]
        del state["_lemmatizer"]
        return state