from gensim.models import Word2Vec, FastText
import pandas as pd
import time
import numpy as np
from KaggleWord2VecUtility import KaggleWord2VecUtility
import sys
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV


def drange(start, stop, step):
    # Float-friendly range generator.
    r = start
    while r < stop:
        yield r
        r += step


def cluster_GMM(num_clusters, word_vectors):
    # Initialize a GMM object and use it for clustering.
    clf = GaussianMixture(n_components=num_clusters, covariance_type="tied",
                          init_params='kmeans', max_iter=50)
    # Fit the model and get hard cluster assignments.
    clf.fit(word_vectors)
    idx = clf.predict(word_vectors)
    print("Clustering Done...", time.time() - start, "seconds")
    # Get probabilities of cluster assignments (soft assignments).
    idx_proba = clf.predict_proba(word_vectors)
    # Dump cluster assignments and probabilities of cluster assignments.
    joblib.dump(idx, 'gmm_latestclusmodel_len2alldata.pkl')
    print("Cluster Assignments Saved...")
    joblib.dump(idx_proba, 'gmm_prob_latestclusmodel_len2alldata.pkl')
    print("Probabilities of Cluster Assignments Saved...")
    return (idx, idx_proba)


def read_GMM(idx_name, idx_proba_name):
    # Load saved cluster assignments and probabilities of cluster assignments.
    idx = joblib.load(idx_name)
    idx_proba = joblib.load(idx_proba_name)
    print("Cluster Model Loaded...")
    return (idx, idx_proba)


def get_probability_word_vectors(featurenames, word_centroid_map, num_clusters, word_idf_dict):
    # Compute the probability word-cluster (word-topic) vectors: for each word,
    # block k of the vector is its embedding scaled by the probability of
    # cluster k and by the word's idf weight.
    prob_wordvecs = {}
    for word in word_centroid_map:
        prob_wordvecs[word] = np.zeros(num_clusters * num_features, dtype="float32")
        for index in range(0, num_clusters):
            try:
                prob_wordvecs[word][index * num_features:(index + 1) * num_features] = \
                    model.wv[word] * word_centroid_prob_map[word][index] * word_idf_dict[word]
            except KeyError:
                # Skip words missing from the embedding vocabulary or the idf dictionary.
                continue
    return prob_wordvecs
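
# A minimal, self-contained sketch of the word-topic vector layout built above,
# using hypothetical toy values (this helper is illustrative only and is never
# called by the pipeline): for a word with embedding v_w of dimension d, soft
# cluster probabilities p_w over K clusters, and idf weight, block k of the
# result holds idf * p_w[k] * v_w, giving a (K * d)-dimensional vector.
def _example_word_topic_vector():
    d, K = 4, 3                                        # toy dimensionality and cluster count
    v_w = np.ones(d, dtype="float32")                  # toy word embedding
    p_w = np.array([0.7, 0.2, 0.1], dtype="float32")   # toy cluster probabilities
    idf = 2.0                                          # toy idf weight
    # Equivalent to the slice assignments in get_probability_word_vectors.
    return np.concatenate([idf * p_w[k] * v_w for k in range(K)])
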
def create_cluster_vector_and_gwbowv(prob_wordvecs, wordlist, word_centroid_map, word_centroid_prob_map,
                                     dimension, word_idf_dict, featurenames, num_centroids, train=False):
    # Compute the SDV (Sparse Document Vector) for a single document.
    bag_of_centroids = np.zeros(num_centroids * dimension, dtype="float32")
    global min_no
    global max_no
    for word in wordlist:
        # Skip out-of-vocabulary words.
        if word not in word_centroid_map:
            continue
        bag_of_centroids += prob_wordvecs[word]
    norm = np.sqrt(np.einsum('...i,...i', bag_of_centroids, bag_of_centroids))
    if norm != 0:
        bag_of_centroids /= norm
    # To make the feature vectors sparse later, track minimum and maximum values.
    if train:
        min_no += min(bag_of_centroids)
        max_no += max(bag_of_centroids)
    return bag_of_centroids


if __name__ == '__main__':
    start = time.time()

    num_features = int(sys.argv[1])  # Word vector dimensionality
    min_word_count = 20              # Minimum word count
    num_workers = 40                 # Number of threads to run in parallel
    context = 10                     # Context window size
    downsampling = 1e-3              # Downsample setting for frequent words
    model_type = sys.argv[3]         # "word2vec" or "fasttext"

    model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + \
        str(context) + "context_len2alldata"

    # Load train, test and combined data.
    train = pd.read_csv('data/train_v2.tsv', header=0, delimiter="\t")
    test = pd.read_csv('data/test_v2.tsv', header=0, delimiter="\t")
    all_data = pd.read_csv('data/all_v2.tsv', header=0, delimiter="\t")

    assert model_type in ["word2vec", "fasttext"]
    if model_type == "word2vec":
        # Load the trained Word2Vec model.
        model = Word2Vec.load(model_name)
    elif model_type == "fasttext":
        # Load the trained FastText model.
        model = FastText.load(model_name)
    # Get word vectors for all words in the vocabulary (gensim 3.x attributes).
    word_vectors = model.wv.vectors
    index2word = model.wv.index2word

    # Set the number of clusters.
    num_clusters = int(sys.argv[2])

    # Create new clusters. To reuse saved cluster assignments and their
    # probabilities instead, comment out this line and uncomment the
    # read_GMM lines below.
    idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

    # idx_name = "gmm_latestclusmodel_len2alldata.pkl"
    # idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"
    # idx, idx_proba = read_GMM(idx_name, idx_proba_name)

    # Create a word / index dictionary, mapping each vocabulary word to a cluster number.
    word_centroid_map = dict(zip(index2word, idx))
    # Create a word / probability dictionary, mapping each vocabulary word to its
    # list of cluster-assignment probabilities.
    word_centroid_prob_map = dict(zip(index2word, idx_proba))

    # Compute tf-idf values over the combined corpus.
    traindata = []
    for i in range(0, len(all_data["news"])):
        traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(all_data["news"][i], True)))

    tfv = TfidfVectorizer(strip_accents='unicode', dtype=np.float32)
    tfidfmatrix_traindata = tfv.fit_transform(traindata)
    featurenames = tfv.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
    idf = tfv.idf_

    # Create a dictionary mapping each word to its idf value.
    print("Creating word-idf dictionary for Training set...")
    word_idf_dict = {}
    for pair in zip(featurenames, idf):
        word_idf_dict[pair[0]] = pair[1]

    # Pre-compute probability word-cluster vectors.
    prob_wordvecs = get_probability_word_vectors(featurenames, word_centroid_map, num_clusters, word_idf_dict)

    temp_time = time.time() - start
    print("Creating Document Vectors...:", temp_time, "seconds.")

    # gwbowv is a matrix containing the normalised document vectors.
    gwbowv = np.zeros((train["news"].size, num_clusters * num_features), dtype="float32")

    counter = 0
    min_no = 0
    max_no = 0
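
    # What the loop below computes, in sketch form (assuming, for the sake of
    # illustration, that every word is in vocabulary): each document vector is
    # the sum of its words' word-topic vectors, L2-normalized, i.e. roughly
    #   dv = sum(prob_wordvecs[w] for w in words if w in word_centroid_map)
    #   dv = dv / np.linalg.norm(dv)   # when the norm is non-zero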
    for review in train["news"]:
        # Get the word list for each news article.
        words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
        gwbowv[counter] = create_cluster_vector_and_gwbowv(prob_wordvecs, words, word_centroid_map,
                                                           word_centroid_prob_map, num_features, word_idf_dict,
                                                           featurenames, num_clusters, train=True)
        counter += 1
        if counter % 1000 == 0:
            print("Train News Covered : ", counter)

    gwbowv_name = "SDV_" + str(num_clusters) + "cluster_" + str(num_features) + "feature_matrix_gmm_sparse.npy"

    gwbowv_test = np.zeros((test["news"].size, num_clusters * num_features), dtype="float32")

    counter = 0
    for review in test["news"]:
        # Get the word list for each news article.
        words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True)
        gwbowv_test[counter] = create_cluster_vector_and_gwbowv(prob_wordvecs, words, word_centroid_map,
                                                                word_centroid_prob_map, num_features, word_idf_dict,
                                                                featurenames, num_clusters)
        counter += 1
        if counter % 1000 == 0:
            print("Test News Covered : ", counter)

    test_gwbowv_name = "TEST_SDV_" + str(num_clusters) + "cluster_" + \
        str(num_features) + "feature_matrix_gmm_sparse.npy"

    print("Making sparse...")
    # Set the threshold percentage for sparsification.
    percentage = 0.04
    min_no = min_no * 1.0 / len(train["news"])
    max_no = max_no * 1.0 / len(train["news"])
    print("Average min: ", min_no)
    print("Average max: ", max_no)
    thres = (abs(max_no) + abs(min_no)) / 2
    thres = thres * percentage

    # Zero out matrix entries whose absolute value falls below the threshold.
    temp = abs(gwbowv) < thres
    gwbowv[temp] = 0
    temp = abs(gwbowv_test) < thres
    gwbowv_test[temp] = 0

    # Save the gwbowv train and test matrices.
    np.save(gwbowv_name, gwbowv)
    np.save(test_gwbowv_name, gwbowv_test)

    endtime = time.time() - start
    print("SDV created and dumped: ", endtime, "seconds.")

    print("Fitting an SVM classifier on the labeled training data...")

    param_grid = [{'C': np.arange(0.1, 7, 0.2)}]
    scores = ['accuracy', 'recall_micro', 'f1_micro', 'precision_micro',
              'recall_macro', 'f1_macro', 'precision_macro',
              'recall_weighted', 'f1_weighted', 'precision_weighted']
    for score in scores:
        strt = time.time()
        print("# Tuning hyper-parameters for", score, "\n")
        clf = GridSearchCV(LinearSVC(C=1), param_grid, cv=5, scoring=score)
        clf.fit(gwbowv, train["class"])
        print("Best parameters set found on development set:\n")
        print(clf.best_params_)
        print("Best value for ", score, ":\n")
        print(clf.best_score_)
        Y_true, Y_pred = test["class"], clf.predict(gwbowv_test)
        print("Report")
        print(classification_report(Y_true, Y_pred, digits=6))
        print("Accuracy: ", clf.score(gwbowv_test, test["class"]))
        print("Time taken:", time.time() - strt, "\n")

    endtime = time.time()
    print("Total time taken: ", endtime - start, "seconds.")
    print("********************************************************")
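
# Example invocation (hypothetical script name and values; the three
# positional arguments are all this script actually reads):
#   python gmm_sklearn.py 200 60 word2vec
# argv[1] = word vector dimensionality, argv[2] = number of GMM clusters,
# argv[3] = embedding model type ("word2vec" or "fasttext").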