# Python source code of the ngram featurizer

import time
from collections import Counter

import logging
import numpy as np
import os
import typing
import warnings
from string import punctuation
from typing import Any, Dict, List, Optional, Text

from rasa_nlu import utils
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.featurizers import Featurizer
from rasa_nlu.training_data import Message, TrainingData

logger = logging.getLogger(__name__)

if typing.TYPE_CHECKING:
    from rasa_nlu.model import Metadata


class NGramFeaturizer(Featurizer):

    provides = ["text_features"]

    requires = ["spacy_doc"]

    defaults = {
        # defines the maximum number of ngrams to collect and add
        # to the featurization of a sentence
        "max_number_of_ngrams": 10,

        # the minimal length in characters of an ngram to be eligible
        "ngram_min_length": 3,

        # the maximal length in characters of an ngram to be eligible
        "ngram_max_length": 17,

        # the minimal number of times an ngram needs to occur in the
        # training data to be considered as a feature
        "ngram_min_occurrences": 5,

        # during cross validation (used to detect which ngrams are most
        # valuable) every intent with fewer examples than this config
        # value will be excluded
        "min_intent_examples": 4,
    }

    def __init__(self, component_config=None):
        """Create a featurizer that has not selected any ngrams yet.

        Both attributes stay `None` until `train_on_sentences` assigns them.
        """
        super(NGramFeaturizer, self).__init__(component_config)

        # ordered list of ngrams and how many of its leading entries to use
        self.all_ngrams = None
        self.best_num_ngrams = None

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["spacy", "sklearn", "cloudpickle"]

    def train(self, training_data: TrainingData, cfg: RasaNLUModelConfig,
              **kwargs: Any):
        """Select ngrams from the intent examples and featurize the data.

        The ngram selection runs on `training_data.intent_examples`;
        afterwards the "text_features" of every entry in
        `training_data.training_examples` are replaced by the version
        that includes the ngram presence flags. `cfg` and `kwargs` are
        not used here.
        """
        started_at = time.time()
        self.train_on_sentences(training_data.intent_examples)
        elapsed = time.time() - started_at
        logger.debug("Ngram collection took {} seconds"
                     "".format(elapsed))

        num_ngrams = self.best_num_ngrams
        for example in training_data.training_examples:
            example.set("text_features",
                        self._text_features_with_ngrams(example, num_ngrams))

    def process(self, message: Message, **kwargs: Any):
        """Extend the "text_features" of `message` with ngram presence flags.

        Uses the number of ngrams stored in `self.best_num_ngrams`.
        """
        message.set("text_features",
                    self._text_features_with_ngrams(message,
                                                    self.best_num_ngrams))

    def _text_features_with_ngrams(self, message, max_ngrams):
        """Return the text features of `message` plus ngram presence flags.

        `max_ngrams` limits how many of the ranked ngrams are considered
        (see `_ngrams_to_use`).
        """
        selected = self._ngrams_to_use(max_ngrams)

        # `_ngrams_to_use` hands back a list for every input, so this
        # fallback is purely defensive
        if selected is None:
            return message.get("text_features")

        presence = np.array(self._ngrams_in_sentence(message, selected))
        return self._combine_with_existing_text_features(message, presence)

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Optional[Text] = None,
             model_metadata: Optional['Metadata'] = None,
             cached_component: Optional['NGramFeaturizer'] = None,
             **kwargs: Any
             ) -> 'NGramFeaturizer':
        """Load a persisted featurizer or fall back to an untrained one.

        The pickle written by `persist` is looked up under the name stored
        in `meta["file"]` inside `model_dir`. If no directory was passed,
        the metadata names no file, or the file does not exist, a fresh
        instance configured with `meta` is returned instead.
        `model_metadata`, `cached_component` and `kwargs` are not used here.
        """

        file_name = meta.get("file")

        # `model_dir` defaults to None and "file" may be missing from the
        # metadata; os.path.join would raise a TypeError for either
        if model_dir is not None and file_name is not None:
            featurizer_file = os.path.join(model_dir, file_name)
            if os.path.exists(featurizer_file):
                # NOTE(security): this presumably unpickles via cloudpickle,
                # which can execute arbitrary code - only load model
                # directories that come from a trusted source
                return utils.pycloud_unpickle(featurizer_file)

        # use `cls` so that subclasses get an instance of their own type
        return cls(meta)

    def persist(self,
                file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Pickle this featurizer into `model_dir`.

        The pickle is named `file_name` plus a ".pkl" suffix. The returned
        dict stores that name under the "file" key, which is the key
        `load` reads to find the pickle again.
        """
        pickle_name = file_name + ".pkl"
        utils.pycloud_pickle(os.path.join(model_dir, pickle_name), self)
        return {"file": pickle_name}

    def train_on_sentences(self, examples):
        """Rank the ngrams found in `examples` and pick how many to use.

        Stores the ranked ngrams in `self.all_ngrams` and the chosen
        number of ngrams in `self.best_num_ngrams`.
        """
        intents = [example.get("intent") for example in examples]

        # the cross validation reads `self.all_ngrams`, so rank first
        self.all_ngrams = self._get_best_ngrams(examples, intents)
        self.best_num_ngrams = self._cross_validation(examples, intents)

    def _ngrams_to_use(self, num_ngrams):
        if num_ngrams == 0 or self.all_ngrams is None:
            return []
        elif num_ngrams is not None:
            return self.all_ngrams[:num_ngrams]
        else:
            return self.all_ngrams

    def _get_best_ngrams(self, examples, labels):
        """Return an ordered list of the best character ngrams."""

        oov_strings = self._remove_in_vocab_words(examples)
        ngrams = self._generate_all_ngrams(
            oov_strings, self.component_config["ngram_min_length"])
        return self._sort_applicable_ngrams(ngrams, examples, labels)

    def _remove_in_vocab_words(self, examples):
        """Automatically removes words with digits in them, that may be a
        hyperlink or that _are_ in vocabulary for the nlp."""

        new_sents = []
        for example in examples:
            new_sents.append(self._remove_in_vocab_words_from_sentence(example))
        return new_sents

    @staticmethod
    def _is_ngram_worthy(token):
        """Decide if we should use this token for ngram counting.

        Excludes every word with digits in them, hyperlinks or
        an assigned word vector."""
        return (not token.has_vector and not token.like_url and not
                token.like_num and not token.like_email and not
                token.is_punct)

    def _remove_in_vocab_words_from_sentence(self, example):
        """Filter for words that do not have a word vector."""

        cleaned_tokens = [token
                          for token in example.get("spacy_doc")
                          if self._is_ngram_worthy(token)]

        # keep only out-of-vocab 'non_word' words
        non_words = ' '.join([t.text for t in cleaned_tokens])

        # remove digits and extra spaces
        non_words = ''.join([letter
                             for letter in non_words
                             if not letter.isdigit()])
        non_words = ' '.join([word
                              for word in non_words.split(' ')
                              if word != ''])

        # add cleaned sentence to list of these sentences
        return non_words

    def _intents_with_enough_examples(self, labels, examples):
        """Filter examples where we do not have a min number of examples."""

        min_intent_examples = self.component_config["min_intent_examples"]
        usable_labels = []

        for label in np.unique(labels):
            lab_sents = np.array(examples)[np.array(labels) == label]
            if len(lab_sents) < min_intent_examples:
                continue
            usable_labels.append(label)

        return usable_labels

    def _rank_ngrams_using_cv(self, examples, labels, list_of_ngrams):
        """Order the ngrams by their randomized logistic regression score.

        Every example is turned into an ngram presence vector, a
        `RandomizedLogisticRegression` is fitted on those vectors against
        the encoded labels, and the ngrams are returned sorted by
        descending feature score (ties keep their original order)."""
        # NOTE(review): RandomizedLogisticRegression was deprecated in
        # scikit-learn 0.19 and removed in 0.21 - confirm that the pinned
        # sklearn version still provides it
        from sklearn import linear_model

        presence = np.array(self._ngrams_in_sentences(examples,
                                                      list_of_ngrams))
        targets = self.encode_labels(labels)

        selector = linear_model.RandomizedLogisticRegression(C=1)
        selector.fit(presence, targets)

        # highest score first; `sorted` is stable
        scores = selector.scores_
        ranking = sorted(range(len(scores)), key=lambda idx: -1 * scores[idx])

        return [list_of_ngrams[idx] for idx in ranking]

    def _sort_applicable_ngrams(self, ngrams_list, examples, labels):
        """Given an intent classification problem and a list of ngrams,

        creates ordered list of most useful ngrams."""

        if not ngrams_list:
            return []

        # make sure we have enough labeled instances for cv
        usable_labels = self._intents_with_enough_examples(labels, examples)

        mask = [label in usable_labels for label in labels]
        if any(mask) and len(usable_labels) >= 2:
            try:
                examples = np.array(examples)[mask]
                labels = np.array(labels)[mask]

                return self._rank_ngrams_using_cv(
                    examples, labels, ngrams_list)
            except ValueError as e:
                if "needs samples of at least 2 classes" in str(e):
                    # we got unlucky during the random
                    # sampling :( and selected a slice that
                    # only contains one class
                    return []
                else:
                    raise e
        else:
            # there is no example we can use for the cross validation
            return []

    def _ngrams_in_sentences(self, examples, ngrams):
        """Given a set of sentences, returns a feature vector for each sentence.

        The first $k$ elements are from the `intent_features`,
        the rest are {1,0} elements denoting whether an ngram is in sentence."""

        all_vectors = []
        for example in examples:
            presence_vector = self._ngrams_in_sentence(example, ngrams)
            all_vectors.append(presence_vector)
        return all_vectors

    def _ngrams_in_sentence(self, example, ngrams):
        """Given a set of sentences, return a vector indicating ngram presence.

        The vector will return 1 entries if the corresponding ngram is
        present in the sentence and 0 if it is not."""

        cleaned_sentence = self._remove_in_vocab_words_from_sentence(example)
        presence_vector = np.zeros(len(ngrams))
        idx_array = [idx
                     for idx in range(len(ngrams))
                     if ngrams[idx] in cleaned_sentence]
        presence_vector[idx_array] = 1
        return presence_vector

    def _generate_all_ngrams(self, list_of_strings, ngram_min_length):
        """Takes a list of strings and generates all character ngrams.

        Generated ngrams are at least 3 characters (and at most 17),
        occur at least 5 times and occur independently of longer
        superset ngrams at least once."""

        features = {}
        counters = {ngram_min_length - 1: Counter()}
        max_length = self.component_config["ngram_max_length"]

        for n in range(ngram_min_length, max_length):
            candidates = []
            features[n] = []
            counters[n] = Counter()

            # generate all possible n length ngrams
            for text in list_of_strings:
                text = text.replace(punctuation, ' ')
                for word in text.lower().split(' '):
                    cands = [word[i:i + n] for i in range(len(word) - n)]
                    for cand in cands:
                        counters[n][cand] += 1
                        if cand not in candidates:
                            candidates.append(cand)

            min_count = self.component_config["ngram_min_occurrences"]
            # iterate over these candidates picking only the applicable ones
            for can in candidates:
                if counters[n][can] >= min_count:
                    features[n].append(can)
                    begin = can[:-1]
                    end = can[1:]
                    if n >= ngram_min_length:
                        if (counters[n - 1][begin] == counters[n][can] and
                                begin in features[n - 1]):
                            features[n - 1].remove(begin)
                        if (counters[n - 1][end] == counters[n][can] and
                                end in features[n - 1]):
                            features[n - 1].remove(end)

        return [item for sublist in list(features.values()) for item in sublist]

    @staticmethod
    def _collect_features(examples):
        if examples:
            collected_features = [e.get("text_features")
                                  for e in examples
                                  if e.get("text_features") is not None]
        else:
            collected_features = []

        if collected_features:
            return np.stack(collected_features)
        else:
            return None

    def _append_ngram_features(self, examples, existing_features, max_ngrams):
        ngrams_to_use = self._ngrams_to_use(max_ngrams)
        extras = np.array(self._ngrams_in_sentences(examples,
                                                    ngrams_to_use))
        if existing_features is not None:
            return np.hstack((existing_features, extras))
        else:
            return extras

    @staticmethod
    def _num_cv_splits(y):
        return min(10, np.min(np.bincount(y))) if y.size > 0 else 0

    @staticmethod
    def encode_labels(labels):
        """Encode the labels with a freshly fitted sklearn `LabelEncoder`."""
        from sklearn.preprocessing import LabelEncoder

        encoder = LabelEncoder()
        encoder.fit(labels)
        encoded = encoder.transform(labels)
        return encoded

    def _score_ngram_selection(self, examples, y, existing_text_features,
                               cv_splits, max_ngrams):
        """Return the mean cross validation score when using `max_ngrams` ngrams.

        A class-balanced logistic regression is cross validated with
        `cv_splits` folds on the existing text features extended by the
        ngram presence columns. Without existing text features the score
        is 0.0."""
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import cross_val_score

        if existing_text_features is None:
            return 0.0

        features = self._append_ngram_features(
            examples, existing_text_features, max_ngrams)
        classifier = LogisticRegression(class_weight='balanced')
        fold_scores = cross_val_score(classifier, features, y, cv=cv_splits)
        return np.mean(fold_scores)

    @staticmethod
    def _generate_test_points(max_ngrams):
        """Generate a list of increasing numbers.

        They are used to take the best n ngrams and evaluate them. This n
        is varied to find the best number of ngrams to use. This function
        defines the number of ngrams that get tested."""

        possible_ngrams = np.linspace(0, max_ngrams, 8)
        return np.unique(list(map(int, np.floor(possible_ngrams))))

    def _cross_validation(self, examples, labels):
        """Choose how many of the ranked ngrams to use.

        Several candidate counts (see `_generate_test_points`) are scored
        by cross validation and the count with the highest score wins.
        Returns 0 if no ngrams were collected, and the configured
        "max_number_of_ngrams" if there are fewer than 3 cross validation
        splits available."""

        max_ngrams = self.component_config["max_number_of_ngrams"]

        if not self.all_ngrams:
            logger.debug("Found no ngrams. Using existing features.")
            return 0

        existing_text_features = self._collect_features(examples)

        y = self.encode_labels(labels)
        cv_splits = self._num_cv_splits(y)

        if cv_splits < 3:
            warnings.warn("Can't cross-validate ngram featurizer. "
                          "There aren't enough examples per intent "
                          "(at least 3)")
            return max_ngrams

        logger.debug("Started ngram cross-validation to find b"
                     "est number of ngrams to use...")

        candidate_counts = self._generate_test_points(max_ngrams)
        scores = []
        for n in candidate_counts:
            score = self._score_ngram_selection(examples, y,
                                                existing_text_features,
                                                cv_splits,
                                                max_ngrams=n)
            scores.append(score)
            logger.debug("Evaluating usage of {} ngrams. "
                         "Score: {}".format(n, score))

        # argmax picks the first (i.e. smallest) count among equal scores
        n_top = candidate_counts[np.argmax(scores)]
        logger.info("Best score with {} ngrams: "
                    "{}".format(n_top, np.max(scores)))
        return n_top