# python source code of wordtwovec

"""
a wrapper class for the gensim Word2Vec model that adds the extra features we need (phrase
embedding and question/answer scoring), plus helper functions for tokenizing and stemming.
"""

from functools import lru_cache
import math
from typing import Iterable, List

from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.porter import PorterStemmer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

import numpy as np

# single module-level Porter stemmer instance; `stem` below calls it for every word
stemmer = PorterStemmer()


@lru_cache(maxsize=1024)
def stem(word: str) -> str:
    """return the Porter stem of `word`; results are memoized because stemming is costly"""
    stemmed = stemmer.stem(word)
    return stemmed


def tokenizer(sentence: str) -> List[str]:
    """tokenize with gensim's `simple_preprocess`, drop gensim `STOPWORDS`, stem the rest"""
    stems = []
    for raw_token in simple_preprocess(sentence):
        # the stopword check is made on the raw token, before stemming
        if raw_token in STOPWORDS:
            continue
        stems.append(stem(raw_token))
    return stems


def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """
    cosine of the angle between `v1` and `v2`, as a built-in float
    (https://en.wikipedia.org/wiki/Cosine_similarity)

    returns 0.0 when either vector has no positive squared norm (e.g. a zero vector), since the
    similarity is undefined there
    """
    # coerce the numpy scalars up front so the result is a plain `float` as annotated
    num = float(np.dot(v1, v2))
    d1 = float(np.dot(v1, v1))
    d2 = float(np.dot(v2, v2))

    # written as a negated "both positive" test so that NaN norms also fall back to 0.0
    if not (d1 > 0.0 and d2 > 0.0):
        return 0.0

    # take the square roots separately: the product d1 * d2 can underflow to zero (or overflow)
    # even when each norm on its own is perfectly representable
    return num / (math.sqrt(d1) * math.sqrt(d2))


class WordTwoVec:
    """
    a wrapper for gensim.Word2Vec with added functionality to embed phrases and compute the
    "goodness" of a question-answer pair based on embedding-vector similarity
    """
    def __init__(self, model_file: str) -> None:
        """
        load the model stored at `model_file`

        a path ending in ".bin" is read as a binary word2vec-format file; any other path is
        handed to `Word2Vec.load`
        """
        # NOTE(review): `Word2Vec.load_word2vec_format` and indexing the model directly (see
        # `embed`) look like the older gensim API; newer releases appear to expose these via
        # `KeyedVectors` / `model.wv` -- confirm against the pinned gensim version
        if model_file.endswith(".bin"):
            self.model = Word2Vec.load_word2vec_format(model_file, binary=True)
        else:
            self.model = Word2Vec.load(model_file)

    def embed(self, words: Iterable[str]) -> np.ndarray:
        """
        given some words, look up their vector embeddings and return the vector mean

        words the model does not contain are skipped; if none of the words are known (or
        `words` is empty) a zero vector of length `self.model.vector_size` is returned
        """
        # keep only the words the model actually has an embedding for
        vectors = [self.model[word] for word in words if word in self.model]

        if not vectors:
            # nothing to average, so fall back to a zero vector
            return np.zeros(self.model.vector_size)

        # element-wise mean across the word vectors
        return np.average(vectors, axis=0)

    def goodness(self, question_stem: str, choice_text: str) -> float:
        """
        how good is the choice for this question?

        both texts are tokenized; tokens of the choice that already occur in the question are
        discarded, and the score is the cosine similarity between the mean embedding of the
        question tokens and the mean embedding of the remaining choice tokens
        """
        question_words = set(tokenizer(question_stem))
        # words repeated from the question say nothing about the choice itself
        choice_words = set(tokenizer(choice_text)) - question_words

        return cosine_similarity(self.embed(question_words), self.embed(choice_words))