python source code of keyword

"""Baseline response ranking methods using keyword matching."""

import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import (HashingVectorizer,
                                             TfidfTransformer,
                                             _document_frequency)
from sklearn.utils.testing import ignore_warnings

from baselines import method


class BM25Method(method.BaselineMethod):
    """BM25 baseline, using weighted keyword matching.

    Adapted from https://github.com/arosh/BM25Transformer/blob/master/bm25.py
    see Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html

    Args:
        k1: float, optional (default=2.0)
        b: float, optional (default=0.75)

    """
    def __init__(self, k1=2.0, b=0.75):
        """Create a new `BM25Method` object."""
        self._k1 = k1
        self._b = b

    def train(self, contexts, responses):
        """Fit the tf-idf transform and compute idf statistics."""
        with ignore_warnings():
            # Ignore deprecated `non_negative` warning.
            self._vectorizer = HashingVectorizer(non_negative=True)
        self._tfidf_transform = TfidfTransformer()
        count_matrix = self._tfidf_transform.fit_transform(
            self._vectorizer.transform(contexts + responses))
        n_samples, n_features = count_matrix.shape
        df = _document_frequency(count_matrix)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = sp.spdiags(
            idf, diags=0, m=n_features, n=n_features
        )
        document_lengths = count_matrix.sum(axis=1)
        self._average_document_length = np.mean(document_lengths)
        print(self._average_document_length)

    def _vectorize(self, strings):
        """Vectorize the given strings."""
        with ignore_warnings():
            # Ignore deprecated `non_negative` warning.
            tf_idf_vectors = self._tfidf_transform.transform(
                self._vectorizer.transform(strings))
        tf_idf_vectors = sp.csr_matrix(
            tf_idf_vectors, dtype=np.float64, copy=True)

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        document_lengths = tf_idf_vectors.sum(axis=1)

        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        num_terms = tf_idf_vectors.indptr[1:] - tf_idf_vectors.indptr[0:-1]

        # In each row, repeat `document_lengths` for `num_terms` times
        # Shape is (sum(num_terms), )
        rep = np.repeat(np.asarray(document_lengths), num_terms)

        # Compute BM25 score only for non-zero elements
        data = tf_idf_vectors.data * (self._k1 + 1) / (
            tf_idf_vectors.data + self._k1 * (
                1 - self._b + self._b * rep / self._average_document_length))

        vectors = sp.csr_matrix(
            (data, tf_idf_vectors.indices, tf_idf_vectors.indptr),
            shape=tf_idf_vectors.shape)
        vectors = vectors * self._idf_diag

        return vectors

    def rank_responses(self, contexts, responses):
        """Rank the responses for each context."""
        contexts_matrix = self._vectorize(contexts)
        responses_matrix = self._vectorize(responses)
        similarities = contexts_matrix.dot(responses_matrix.T).toarray()
        return np.argmax(similarities, axis=1)


class TfIdfMethod(method.BaselineMethod):
    """TF-IDF baseline.

    This hashes words to sparse IDs, and then computes tf-idf statistics for
    these hashed IDs. As a result, no words are considered out-of-vocabulary.
    """
    def train(self, contexts, responses):
        """Fit the tf-idf transform and compute idf statistics."""
        self._vectorizer = HashingVectorizer()
        self._tfidf_transform = TfidfTransformer()
        self._tfidf_transform.fit(
            self._vectorizer.transform(contexts + responses))

    def _vectorize(self, strings):
        """Vectorize the given strings."""
        tf_idf_vectors = self._tfidf_transform.transform(
            self._vectorizer.transform(strings))
        return sp.csr_matrix(
            tf_idf_vectors, dtype=np.float64, copy=True)

    def rank_responses(self, contexts, responses):
        """Rank the responses for each context."""
        contexts_matrix = self._vectorize(contexts)
        responses_matrix = self._vectorize(responses)
        similarities = contexts_matrix.dot(responses_matrix.T).toarray()
        return np.argmax(similarities, axis=1)