# python source code of test

from __future__ import unicode_literals, division, print_function, absolute_import
from builtins import object, range
from glob import glob
import pickle as pkl
import logging
from copy import deepcopy
import numpy as np

from conec import word2vec
from conec import context2vec

# Configure the root logger at import time: INFO level, "<time> : <level> : <message>" format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class Text8Corpus(object):
    """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .

    The file is read as one continuous stream of whitespace-separated tokens
    and cut into consecutive pseudo-sentences of `max_sentence_length` tokens
    each; only the final pseudo-sentence may have a different length.

    Args:
        fname: path of the unzipped corpus file.
        max_sentence_length: number of tokens per yielded pseudo-sentence
            (default 1000, the value that used to be hard-coded).

    Raises:
        ValueError: if `max_sentence_length` is smaller than 1 (this would
            otherwise make the iteration loop forever).
    """

    def __init__(self, fname, max_sentence_length=1000):
        if max_sentence_length < 1:
            raise ValueError("max_sentence_length must be >= 1, got %r" % (max_sentence_length,))
        self.fname = fname
        self.max_sentence_length = max_sentence_length

    def __iter__(self):
        # the entire corpus is one gigantic line -- there are no sentence marks at all
        # so just split the sequence of tokens arbitrarily: 1 sentence = max_sentence_length tokens
        max_sentence_length = self.max_sentence_length
        sentence, rest = [], ''
        with open(self.fname) as fin:
            while True:
                text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
                if text == rest:  # nothing new was read => EOF
                    # return the last chunk of words, too (may be shorter/longer)
                    sentence.extend(rest.split())
                    if sentence:
                        yield sentence
                    break
                # the last token may have been split in two... keep it for the next iteration
                last_token = text.rfind(' ')
                if last_token >= 0:
                    words, rest = text[:last_token].split(), text[last_token:].strip()
                else:
                    # no separator in the buffer yet: everything read so far is one unfinished token
                    words, rest = [], text
                sentence.extend(words)
                while len(sentence) >= max_sentence_length:
                    yield sentence[:max_sentence_length]
                    sentence = sentence[max_sentence_length:]


class OneBilCorpus(object):
    """Iterate over the "1-billion-word-language-modeling-benchmark" corpus,
    downloaded from http://code.google.com/p/1-billion-word-language-modeling-benchmark/ .

    Each yielded item is the lowercased, whitespace-split token list of one
    whole corpus file (not of a single sentence).

    Args:
        pattern: glob pattern selecting the corpus files; defaults to the
            shuffled training files below `data/`.
    """

    DEFAULT_PATTERN = 'data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/news*'

    def __init__(self, pattern=None):
        # attribute name `dir` is kept for backward compatibility; it holds a glob pattern
        self.dir = self.DEFAULT_PATTERN if pattern is None else pattern

    def __iter__(self):
        # go file by file; glob() returns matches in arbitrary order, so sort
        # them to make the iteration order reproducible across runs and machines
        for fname in sorted(glob(self.dir)):
            with open(fname) as f:
                yield f.read().lower().split()


def analogy(model, a, b, c):
    """Score every vocabulary word as candidate x for "a is to b as c is to x".

    For each of the three words, the dot product of its row in
    `model.wv.vectors_norm` with every row is computed and shifted by +1
    (presumably the rows are unit length, so these are cosine similarities
    moved into [0, 2] -- confirm against the model class). The additive
    combination `b - a + c` of the three score vectors is returned, one
    score per row of `model.wv.vectors_norm`.
    """
    vectors = model.wv.vectors_norm
    vocab = model.wv.vocab

    def shifted_sims(word):
        # similarity of `word` to all words, shifted by one
        return 1. + np.dot(vectors, vectors[vocab[word].index])

    sims_a = shifted_sims(a)
    sims_b = shifted_sims(b)
    sims_c = shifted_sims(c)
    # additive variant; the multiplicative alternative would be (sims_b * sims_c) / sims_a
    return sims_b - sims_a + sims_c


def accuracy(model, questions, lowercase=True, restrict_vocab=30000):
    """
    Compute accuracy of the model. `questions` is a filename where lines are
    4-tuples of words, split into sections by ": SECTION NAME" lines.
    See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

    The accuracy is reported (=printed to stdout and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word whose frequency
    is not in the top-N most frequent words (default top 30,000).

    If `lowercase` is true, the question words are lowercased before the lookup.

    Lines that do not consist of exactly four whitespace-separated words are
    reported and skipped.

    Returns a list of dicts with the keys 'section', 'correct' and 'incorrect':
    one per section in file order, followed by a final 'total' entry.

    Raises ValueError if a question line appears before the first section header.

    This method corresponds to the `compute-accuracy` script of the original C word2vec.

    """
    # the `restrict_vocab` most frequent words and their row indexes
    ok_vocab = dict(sorted(model.wv.vocab.items(), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in ok_vocab.values())

    def log_accuracy(section):
        correct, incorrect = section['correct'], section['incorrect']
        if correct + incorrect > 0:
            print("%s: %.1f%% (%i/%i)" % (section['section'],
                                          100.0 * correct / (correct + incorrect), correct, correct + incorrect))

    sections, section = [], None
    # the context manager closes the questions file even if an error is raised
    with open(questions) as fin:
        for line_no, line in enumerate(fin):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            if line.startswith(': '):
                # a new section starts => store the old section
                if section is not None:
                    sections.append(section)
                    log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
                continue
            if section is None:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            words = line.split()
            if lowercase:
                words = [word.lower() for word in words]
            if len(words) != 4:
                # skip the line for real: falling through would score it with the
                # words of the previous question (or fail on the very first one)
                print("skipping invalid line #%i in %s" % (line_no, questions))
                continue
            a, b, c, expected = words
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                # questions with out-of-vocabulary words are silently ignored
                continue

            ignore = set(model.wv.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in np.argsort(analogy(model, a, b, c))[::-1]:
                if index in ok_index and index not in ignore:
                    predicted = model.wv.index2word[index]
                    break
            section['correct' if predicted == expected else 'incorrect'] += 1
    if section is not None:
        # store the last section, too
        sections.append(section)
        log_accuracy(section)

    total = {'section': 'total',
             'correct': sum(s['correct'] for s in sections),
             'incorrect': sum(s['incorrect'] for s in sections)}
    log_accuracy(total)
    sections.append(total)
    return sections


def accuracy_examples(model):
    """Print the model's answers to a few analogy queries and one odd-one-out query."""
    wv = model.wv
    # just as advertised: "man" is to "king" as "woman" is to ...?
    print(wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
    # "boy" is to "father" as "girl" is to ...?
    print(wv.most_similar(['girl', 'father'], ['boy'], topn=3))
    # each entry reads "a b x": a is to b as x is to ...?
    more_examples = ["he his she", "big bigger bad", "going went being"]
    for a, b, x in (example.split() for example in more_examples):
        top_hit = wv.most_similar([x, b], [a])[0]
        predicted = top_hit[0]
        print("'%s' is to '%s' as '%s' is to '%s'" % (a, b, x, predicted))
    # which word doesn't go with the others?
    candidates = "breakfast cereal dinner lunch".split()
    print(wv.doesnt_match(candidates))


def evaluate_google():
    """Evaluate the pretrained GoogleNews embeddings on the analogy questions.

    The embeddings are described at https://code.google.com/archive/p/word2vec/
    and are read from `data/GoogleNews-vectors-negative300.bin.gz`.
    """
    # gensim is only needed here, so it is imported locally
    from gensim.models import Word2Vec
    # NOTE(review): newer gensim releases appear to provide this loader on
    # KeyedVectors rather than Word2Vec -- confirm against the installed version
    load_embeddings = Word2Vec.load_word2vec_format
    model_google = load_embeddings('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
    # third positional argument is `lowercase`: keep the original casing of the questions
    accuracy(model_google, "data/questions-words.txt", False)


def evaluate_word2vec(corpus, seed=1):
    """Load the pickled cbow model for `corpus` and `seed` from `data/` and print its analogy accuracy."""
    # NOTE(review): this name has no "_it<N>" suffix, unlike the files written by
    # train_word2vec -- presumably the final model is renamed by hand; verify
    model_file = "%s_cbow_200_hs0_neg13_seed%i.model" % (corpus, seed)
    model_path = "data/%s" % model_file
    with open(model_path, 'rb') as f:
        model = pkl.load(f)
    accuracy(model, "data/questions-words.txt")


def evaluate_contextenc(corpus, seed=1):
    """Evaluate a trained word2vec model after multiplying its embeddings with the context matrix.

    The pickled model for `corpus` and `seed` is loaded from `data/`, a
    context model is built from the same corpus, and for `fill_diag` in
    (True, False) a copy of the model gets its `wv.vectors_norm` replaced by
    the row-normalized product `context_mat . vectors_norm` before the
    analogy accuracy is printed.

    Args:
        corpus: 'text8' or '1bil'.
        seed: seed that is part of the model's file name.

    Raises:
        ValueError: if `corpus` is not one of the supported names.
    """
    # fail early with a clear message instead of hitting an unbound `sentences` later on
    if corpus not in ('text8', '1bil'):
        raise ValueError("unknown corpus %r (expected 'text8' or '1bil')" % (corpus,))
    # load word2vec model
    print("####### seed = %i" % seed)
    # NOTE(review): this name has no "_it<N>" suffix, unlike the files written by
    # train_word2vec -- presumably the final model is renamed by hand; verify
    fname = "%s_cbow_200_hs0_neg13_seed%i.model" % (corpus, seed)
    with open("data/%s" % fname, 'rb') as f:
        model_org = pkl.load(f)
    # get context matrix
    if corpus == 'text8':
        sentences = Text8Corpus('data/text8')
    else:
        sentences = OneBilCorpus()
    context_model = context2vec.ContextModel(
        sentences, min_count=model_org.min_count, window=model_org.window, wordlist=model_org.wv.index2word)
    for fill_diag in [True, False]:
        # work on a copy so that every variant starts from the original embeddings
        model = deepcopy(model_org)
        # build context matrix
        print("constructing context matrix for fill_diag: %s" % (fill_diag))
        context_mat = context_model.get_context_matrix(fill_diag, False)
        # adapt the word2vec model
        print("adapting the word2vec weights - vectors_norm")
        model.wv.vectors_norm = context_mat.dot(model.wv.vectors_norm)
        # renormalize: divide every row by its norm
        model.wv.vectors_norm = model.wv.vectors_norm / np.array([np.linalg.norm(model.wv.vectors_norm, axis=1)]).T
        # evaluate
        print("evaluating the model")
        _ = accuracy(model, "data/questions-words.txt")


def train_word2vec(corpus='text8', seed=1, it=10, save_interm=True):
    """Train a cbow word2vec model on `corpus`, evaluating (and optionally saving) it along the way.

    The model is created from the corpus and then `model.train` is called
    `it - 1` more times (presumably the constructor performs the first
    training pass -- confirm against conec.word2vec). Before each of these
    calls the analogy accuracy is printed. The final model is always pickled
    to `data/<corpus>_cbow_200_hs0_neg13_seed<seed>_it<it>.model`, evaluated
    once more, and used for a few example queries.

    Args:
        corpus: 'text8' or '1bil'.
        seed: seed passed to the model; also part of the saved file names.
        it: iteration number used for the final evaluation and file name.
        save_interm: if true, also pickle the model before every additional
            training pass.

    Raises:
        ValueError: if `corpus` is not one of the supported names.
    """
    # load text
    if corpus == 'text8':
        sentences = Text8Corpus('data/text8')
    elif corpus == '1bil':
        sentences = OneBilCorpus()
    else:
        # fail with a clear message instead of hitting an unbound `sentences` below
        raise ValueError("unknown corpus %r (expected 'text8' or '1bil')" % (corpus,))

    def save_model(model, saven):
        # detach the huge table while pickling; keeping a reference is enough,
        # copying it would only duplicate the memory
        table = model.table
        model.table = None
        try:
            # pickle the entire model to disk, so we can load&resume training later
            with open("data/%s" % saven, 'wb') as f:
                pkl.dump(model, f, -1)
        finally:
            # reinstate the table to continue training, even if pickling failed
            model.table = table

    # train the cbow model; default window=5
    model = word2vec.Word2Vec(sentences, mtype='cbow', hs=0, neg=13, vector_size=200, alpha=0.025, min_alpha=0.01, seed=seed)
    for i in range(1, it):
        print("####### ITERATION %i ########" % i)
        _ = accuracy(model, "data/questions-words.txt")
        if save_interm:
            save_model(model, "%s_cbow_200_hs0_neg13_seed%i_it%i.model" % (corpus, seed, i))
        model.train(sentences, alpha=0.025, min_alpha=0.01)
    save_model(model, "%s_cbow_200_hs0_neg13_seed%i_it%i.model" % (corpus, seed, it))
    print("####### ITERATION %i ########" % it)
    _ = accuracy(model, "data/questions-words.txt")
    accuracy_examples(model)


def main():
    """Train word2vec on text8, evaluate it, then evaluate again after applying the context matrix."""
    # a single constructor call would train the model for 1 iteration only;
    # to replicate the results of the paper, you should train the model for 10 iterations.
    # we set `it' to 3 here to speed up the process, change it to 10 for better accuracies
    it = 3
    train_word2vec(corpus='text8', seed=3, it=it, save_interm=False)
    # training saves the model (e.g. for training on a cluster), so load it back from disk
    model_path = "data/text8_cbow_200_hs0_neg13_seed3_it%i.model" % it
    with open(model_path, "rb") as f:
        model = pkl.load(f)
    """
        collected 253854 unique words from a corpus of 17005207 words and 17006 sentences
        total of 71290 unique words after removing those with count < 5
        training model on 71290 vocabulary and 200 features
        training on 16718844 words took 2789.4s, 5994 words/s
    """
    # accuracy on the analogy task (the results below are after 3 iterations)
    accuracy(model, "data/questions-words.txt")
    """
        capital-common-countries: 35.8% (181/506)
        capital-world: 15.8% (230/1452)
        currency: 10.4% (28/268)
        city-in-state: 19.1% (300/1571)
        family: 73.2% (224/306)
        gram1-adjective-to-adverb: 11.2% (85/756)
        gram2-opposite: 19.3% (59/306)
        gram3-comparative: 57.0% (718/1260)
        gram4-superlative: 33.6% (170/506)
        gram5-present-participle: 24.2% (240/992)
        gram6-nationality-adjective: 62.8% (861/1371)
        gram7-past-tense: 27.5% (366/1332)
        gram8-plural: 41.3% (410/992)
        gram9-plural-verbs: 39.4% (256/650)
        total: 33.6% (4128/12268)
    """
    # the global context matrix is built from the same text the model was trained on
    sentences = Text8Corpus('data/text8')
    context_model = context2vec.ContextModel(
        sentences,
        min_count=model.min_count,
        window=model.window,
        wordlist=model.wv.index2word,
    )
    # best results on the analogy task when counting the target word in addition to the context words
    # --> fill diagonal of the context matrix. normalization is irrelevant since we renormalize later
    context_mat = context_model.get_context_matrix(fill_diag=True, norm=False)
    # adapt the word embeddings of the word2vec model by multiplying them with the context matrix
    model.wv.vectors_norm = context_mat.dot(model.wv.vectors_norm)
    # renormalize: dividing every row by its norm gives the embeddings unit length again
    row_norms = np.array([np.linalg.norm(model.wv.vectors_norm, axis=1)]).T
    model.wv.vectors_norm = model.wv.vectors_norm / row_norms
    # evaluate the adapted model
    accuracy(model, "data/questions-words.txt")
    """
        capital-common-countries: 62.3% (315/506)
        capital-world: 34.9% (507/1452)
        currency: 15.3% (41/268)
        city-in-state: 29.2% (458/1571)
        family: 72.5% (222/306)
        gram1-adjective-to-adverb: 14.0% (106/756)
        gram2-opposite: 19.9% (61/306)
        gram3-comparative: 54.2% (683/1260)
        gram4-superlative: 32.8% (166/506)
        gram5-present-participle: 26.7% (265/992)
        gram6-nationality-adjective: 56.1% (769/1371)
        gram7-past-tense: 25.5% (340/1332)
        gram8-plural: 37.1% (368/992)
        gram9-plural-verbs: 24.8% (161/650)
        total: 36.4% (4462/12268)
    """


# Run the train/evaluate demo only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()