# Python source code of synthetic

from socialsent import constants
from socialsent import util
import polarity_induction_methods
import time
from socialsent import seeds
from socialsent import vocab
import random
import numpy as np
from socialsent import evaluate_methods
from Queue import Empty
from multiprocessing import Process, Queue
from socialsent.representations.representation_factory import create_representation
from socialsent.representations.embedding import Embedding
from sklearn.utils.extmath import randomized_svd
#from scipy.sparse import csr_matrix, vstack
from numpy import vstack
from scipy.stats import logistic

# Scaling factor read by _sample_vec: the total mass of counts.m is multiplied
# by this value (and divided by the number of negative words) to get the size
# of each multinomial sample. Presumably the target relative frequency of a
# synthetic word -- confirm before relying on that reading.
SYNTH_FREQ = 5*10**-5.0

# Word lists mixed together to build the synthetic vectors; they are the
# default arguments of make_synthetic_data. The commented-out lines are
# alternative lists kept for reference.
#NEW_POS = ["cheerful", "beautiful", "charming", "pleasant", "sweet", "favourable", "cheery"]
NEW_POS = ["cheerful", "beautiful", "charming", "merry", "pleasing"]
NEW_NEG = ["hideous", "terrible", "dreadful", "worst", "awful"]
#NEW_NEG = ["disgusting", "hideous", "terrible", "unhappy", "nasty", "repulsive", "offensive"]
# NOTE: these are aliases of the lists above, not copies -- mutating one
# mutates the other.
OLD_POS = NEW_POS
OLD_NEG = NEW_NEG

# Decade start years 1850, 1860, ..., 1990. main() queues one job per entry
# and _make_weight reads the first and last entries to rescale a year.
YEARS = range(1850, 1991, 10)

"""
Runs synthetic test of amelioration and pejoration.
"""

def worker(proc_num, queue, iter):
    """Consume years from ``queue`` until it is empty, scoring each one.

    For every year taken from the queue this loads the SVD embedding for that
    year, restricts it to the adjectives returned by ``vocab.pos_words``,
    appends synthetic words via ``make_synthetic_data`` and runs the
    random-walk polarity induction on the result. The polarities are pickled
    to ``constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl'``.

    proc_num -- identifier used only in progress messages.
    queue    -- queue of years; polled without blocking.
    iter     -- iteration index; used as the seed offset for the synthetic
                vectors and as the suffix of the output file name. (The name
                shadows the builtin but is kept because it is part of the
                signature.)

    Returns None once the queue reports Empty.
    """
    while True:
        # Random pause of up to ten seconds before every poll of the queue.
        time.sleep(10 * random.random())
        try:
            job = queue.get(block=False)
        except Empty:
            # Single-argument print: same output as ``print a, b`` on Python 2.
            print("%s Finished" % (proc_num,))
            return

        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year_str = str(job)
        print("%s On year %s" % (proc_num, year_str))

        adjectives = vocab.pos_words(year_str, "ADJ")
        embedding = create_representation("SVD", constants.COHA_EMBEDDINGS + year_str)
        print("%s %s" % (year_str, len(adjectives)))

        # Keep only the adjectives that actually have an embedding row.
        adjectives = adjectives.intersection(set(embedding.iw))
        print("%s %s" % (year_str, len(adjectives)))

        mix_weight = _make_weight(float(year_str))
        print("%s %s" % (year_str, mix_weight))

        # The same sub-embedding serves as both the "ppmi" and "counts" input.
        sub_embedding = embedding.get_subembed(adjectives)
        synthetic_embedding = make_synthetic_data(
            sub_embedding, sub_embedding, adjectives, mix_weight, seed_offset=iter)

        scores = evaluate_methods.run_method(
            positive_seeds,
            negative_seeds,
            synthetic_embedding,
            method=polarity_induction_methods.random_walk,
            beta=0.9,
            nn=25,
            **evaluate_methods.DEFAULT_ARGUMENTS)

        out_path = constants.POLARITIES + year_str + '-synth-adj-coha-' + str(iter) + '.pkl'
        util.write_pickle(scores, out_path)

def _make_weight(year):
    """Map a year to a mixing weight that falls from about 0.98 to about 0.02.

    The year is rescaled linearly so that YEARS[0] maps to -1 and YEARS[-1]
    maps to +1, multiplied by -4, and passed through the logistic CDF. The
    result is therefore logistic.cdf(4) at the first year, 0.5 at the midpoint
    and logistic.cdf(-4) at the last year.

    year -- numeric year (int or float).

    Returns the value of ``logistic.cdf`` for the rescaled year.
    """
    first, last = YEARS[0], YEARS[-1]
    # Force true division: with an int ``year`` Python 2 would otherwise
    # floor-divide and collapse every year onto a few coarse steps. For float
    # input the result is unchanged.
    scaled = 2 * (year - first) / float(last - first) - 1
    scaled *= -4
    return logistic.cdf(scaled)

def make_synthetic_data(ppmi, counts, word_subset, new_weight, num_synth=10,
        old_pos=OLD_POS, new_pos=NEW_POS, old_neg=OLD_NEG, new_neg=NEW_NEG, dim=300, seed_offset=0):
    """Append synthetic ameliorating and pejorating words to an embedding.

    ``num_synth`` "ameliorating" rows mix ``new_pos`` (weight ``new_weight``)
    with ``old_neg`` (weight ``1 - new_weight``); ``num_synth`` "pejorating"
    rows mix ``old_pos`` (weight ``1 - new_weight``) with ``new_neg`` (weight
    ``new_weight``). Each row comes from ``_sample_vec2``. The rows are stacked
    under ``counts.m`` and labelled ``a-0 .. a-(n-1)`` and ``p-0 .. p-(n-1)``.

    ppmi        -- representation whose ``iw`` list supplies the base vocabulary.
    counts      -- representation whose ``m`` matrix supplies the base rows and
                   whose ``represent`` method supplies the word vectors.
    word_subset -- unused; kept for interface compatibility.
    new_weight  -- mixing weight of the "new" sense, see above.
    num_synth   -- number of synthetic words of each kind.
    old_pos, new_pos, old_neg, new_neg -- word lists to mix.
    dim         -- unused (the SVD step is disabled); kept for compatibility.
    seed_offset -- added to every per-word random seed.

    Returns ``Embedding(matrix, vocabulary)``.

    NOTE(review): the rows come from ``counts.m`` but the vocabulary from
    ``ppmi.iw``; the two only line up when both arguments describe the same
    words (the caller in this file passes the same object twice) -- confirm
    before calling with different representations.
    """
    print("Sampling positive...")
    amel_vecs = [
        _sample_vec2(new_pos, old_neg, counts, new_weight, seed=i + seed_offset)
        for i in range(num_synth)
    ]
    print("Sampling negative...")
    pejor_vecs = [
        _sample_vec2(old_pos, new_neg, counts, 1 - new_weight,
                     seed=i + num_synth + seed_offset)
        for i in range(num_synth)
    ]
    print("Making matrix...")
    # One vstack over the base matrix plus every synthetic row. Numpy promotes
    # the 1-D rows to 2-D, so the result equals stacking two intermediate
    # matrices, and num_synth == 0 no longer fails on an empty vstack.
    u = vstack([counts.m] + amel_vecs + pejor_vecs)
    # Message kept for output compatibility; no SVD is performed here.
    print("SVD on matrix...")
    # Copy the vocabulary: extending ``ppmi.iw`` itself would leave the
    # caller's embedding with more words than matrix rows.
    new_vocab = list(ppmi.iw)
    new_vocab.extend(['a-{0:d}'.format(i) for i in range(num_synth)])
    new_vocab.extend(['p-{0:d}'.format(i) for i in range(num_synth)])
    return Embedding(u, new_vocab)

def _sample_vec2(pos_words, neg_words, counts, pos_weight, seed=1):
    vec = np.zeros((counts.m.shape[1],))
    np.random.seed(seed)
    pos_weights = np.random.dirichlet(np.repeat(0.1, len(pos_words)))
    pos_weights = pos_weights / np.sum(pos_weights) 
    print pos_weights
    for i, word in enumerate(pos_words): 
        sample_vec = pos_weights[i] * pos_weight * counts.represent(word)
        vec += sample_vec
    neg_weights = np.random.dirichlet(np.repeat(0.1, len(pos_words)))
    neg_weights = neg_weights / np.sum(neg_weights) 
    for i, word in enumerate(neg_words): 
        sample_vec = neg_weights[i] * (1-pos_weight) * counts.represent(word)
        vec += sample_vec
    return vec / np.linalg.norm(vec)


def _sample_vec(pos_words, neg_words, counts, pos_weight, seed):
    """Sample a synthetic count row and convert it to a normalised sparse row.

    For every word in ``pos_words`` and ``neg_words`` the word's row from
    ``counts`` is normalised to a probability vector and a multinomial sample
    is drawn from it. The samples are combined with uniform random weights
    (normalised to sum to one) and with ``pos_weight`` / ``1 - pos_weight``.
    The combined row is appended to ``counts.m``, the stacked matrix is
    normalised to sum to one, and every nonzero entry of the row is replaced by

        max(log(p(row, col) / (p(row) * p(col) ** 0.75)), 0)

    whenever that value is finite. The row is finally scaled to unit length.

    pos_words, neg_words -- sequences of words known to ``counts``.
    counts     -- object exposing ``m`` and ``represent(word)``. The code calls
                  ``.todense()`` and in-place division on the represented
                  rows, so they are presumably scipy sparse float matrices --
                  confirm against the representation class.
    pos_weight -- share of the mixture given to the positive words.
    seed       -- value passed to ``np.random.seed`` (reseeds the global
                  NumPy generator).

    Returns a 1 x n ``csr_matrix``. The positive weights are printed as a
    side effect.

    NOTE(review): ``new_mat[neigh, :]`` indexes a *row* with a column index,
    which presumably assumes a square word-by-word matrix -- confirm.
    This function is not called anywhere in the visible part of this module.
    """
    # Function-scope imports: the module-level scipy.sparse import is
    # commented out, so ``csr_matrix`` was undefined here and the module-level
    # ``vstack`` is NumPy's dense one, which cannot stack sparse matrices.
    from scipy.sparse import csr_matrix
    from scipy.sparse import vstack as sparse_vstack

    sample_size = counts.m.sum() * SYNTH_FREQ / len(neg_words)
    # multinomial expects an integer number of draws; truncate explicitly
    # instead of relying on an implicit float-to-int conversion.
    num_draws = int(sample_size)
    vec = np.zeros((counts.m.shape[1],))
    np.random.seed(seed)
    pos_weights = np.random.uniform(size=len(pos_words))
    pos_weights = pos_weights / np.sum(pos_weights)
    print(pos_weights)
    for i, word in enumerate(pos_words):
        sample_vec = counts.represent(word)
        sample_vec /= float(sample_vec.sum())
        sample_vec = pos_weights[i] * pos_weight * np.random.multinomial(num_draws, sample_vec.todense().A[0])
        sample_vec = np.clip(sample_vec, 0, sample_size)
        if not np.isfinite(sample_vec.sum()):
            print("Infinite sample with %s" % (word,))
            continue
        vec += sample_vec
    neg_weights = np.random.uniform(size=len(neg_words))
    neg_weights = neg_weights / np.sum(neg_weights)
    for i, word in enumerate(neg_words):
        sample_vec = counts.represent(word)
        sample_vec /= float(sample_vec.sum())
        sample_vec = neg_weights[i] * (1-pos_weight) * np.random.multinomial(num_draws, sample_vec.todense().A[0])
        sample_vec = np.clip(sample_vec, 0, sample_size)
        if not np.isfinite(sample_vec.sum()):
            print("Infinite sample with %s" % (word,))
            continue
        vec += sample_vec
    vec = csr_matrix(vec)
    # CSR output so the row and element indexing below is supported.
    new_mat = sparse_vstack([counts.m, vec], format="csr")
    new_mat = new_mat / new_mat.sum()
    synth_prob = new_mat[-1,:].sum()
    # The nonzero column list is computed once, before any entry is rewritten.
    for neigh in vec.nonzero()[1]:
        val = max(np.log(new_mat[-1,neigh]
                / (synth_prob * new_mat[neigh,:].sum() ** 0.75)),
                0)
        # Non-finite scores are skipped, leaving the sampled count in place.
        if np.isfinite(val):
            vec[0, neigh] = val
    return vec / np.sqrt((vec.multiply(vec).sum()))
         
def main(iter):
    """Run one full iteration: score every year in YEARS with worker processes.

    All years are placed on a shared queue, twenty ``worker`` processes are
    created and started, and the call blocks until every one of them has
    exited.

    iter -- iteration index forwarded unchanged to each worker.
    """
    num_procs = 20

    queue = Queue()
    for year in YEARS:
        queue.put(year)

    # Build the whole pool first, then start it, then wait for it.
    procs = []
    for proc_num in range(num_procs):
        procs.append(Process(target=worker, args=[proc_num, queue, iter]))
    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()

if __name__ == "__main__":
    # Fifty iterations (0..49). The index reaches worker(), where it is used
    # as the seed offset and as the suffix of the output file name.
    for iter in range(50):
        main(iter)