import data_preparation_tools as dpt
import fnmatch
import gensim
import logging
import multiprocessing
import numpy as np
import sklearn.metrics as metrics
import re

from gensim.models.doc2vec import *
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from spacy.en import English

# Shared CountVectorizer defaults for every bag-of-words helper below.
DEFAULT_BOW_NGRAM_RANGE = (1, 1)
DEFAULT_BOW_MAX_FEATURES = None
DEFAULT_BOW_BINARY = True

# Placeholder tokens that mask entity mentions inside the sentences,
# e.g. "GGVARENTTY12GG" / "MMVARENTTY3MM".
ENTITY_REGEX = re.compile(r"GGVARENTTY[0-9]+GG|MMVARENTTY[0-9]+MM", re.IGNORECASE)

BINARY_LABELS_TO_CLASSES_TABLE = {
    'NO_RELATION': 0,
    'RELATION': 1
}

MULTICLASS_LABELS_TO_CLASSES_TABLE = {
    'NO_RELATION': 0,
    'NEGATIVE_DIRECT': 1,
    'NEGATIVE_INDIRECT': 2,
    'POSITIVE_DIRECT': 3,
    'POSITIVE_INDIRECT': 4
}


class TrainTestData:
    """Bundle of train/test samples and labels for one dataset.

    feature_gen_model optionally holds the fitted object (vectorizer or
    doc2vec model) that produced the features, so callers can reuse it.
    """

    def __init__(self, train_data, train_labels, test_data, test_labels,
                 is_multiclass, feature_gen_model=None):
        self.train_data = train_data
        self.train_labels = train_labels
        self.test_data = test_data
        self.test_labels = test_labels
        self.is_multiclass = is_multiclass
        self.feature_gen_model = feature_gen_model


class EvaluationResult:
    """Holds a fitted classifier plus its predicted scores on the test set."""

    def __init__(self, model, features_gen_model, test_data, scores):
        self.model = model
        self.test_data = test_data
        self.scores = scores
        self.features_gen_model = features_gen_model


# read texts and labels from data file:
def read_data_from_file(file_path):
    """Read one "<label>\\t<text>" pair per line from a TSV file.

    Lines with fewer than two tab-separated fields are skipped.
    Returns (data, labels) as two parallel lists of strings.
    """
    labels = []
    data = []
    with open(file_path) as handle:
        for line in handle:
            parts = line.rstrip().split("\t")
            if len(parts) < 2:
                continue  # malformed or empty line
            labels.append(parts[0])
            data.append(parts[1])
    return data, labels


def read_train_and_test_data_from_path(path):
    """Load every "<name>_train.tsv" / "<name>_test.tsv" pair under *path*.

    A dataset whose name contains "multiclass" is flagged as multiclass.
    Returns two dicts keyed by dataset name: the raw data, and a variant
    with entity placeholder tokens stripped from the sentences.
    """
    only_files = [f for f in listdir(path) if isfile(join(path, f))]
    train_files = [f for f in only_files if fnmatch.fnmatch(f, '*_train.tsv')]
    data_names = ["_".join(f.split("_")[:-1]) for f in train_files]

    data_table = {}
    data_table_no_entities = {}
    for name in data_names:
        train_data, train_labels = read_data_from_file(join(path, name + "_train.tsv"))
        test_data, test_labels = read_data_from_file(join(path, name + "_test.tsv"))
        is_multiclass = name.find('multiclass') > -1

        # Entity-free variants as well.
        # NOTE(review): indices_to_remove is computed but never used --
        # sentences that became empty stay in the lists (as "") and the
        # labels are reused unchanged; preserved original behavior.
        train_data_no_entities, indices_to_remove = remove_entities_from_text(train_data)
        train_labels_no_entities = train_labels
        test_data_no_entities, indices_to_remove = remove_entities_from_text(test_data)
        test_labels_no_entities = test_labels

        data_table[name] = TrainTestData(train_data, train_labels,
                                         test_data, test_labels, is_multiclass)
        data_table_no_entities[name] = TrainTestData(
            train_data_no_entities, train_labels_no_entities,
            test_data_no_entities, test_labels_no_entities, is_multiclass)
    return data_table, data_table_no_entities


def remove_entities_from_text(sentences):
    """Strip entity placeholder tokens (ENTITY_REGEX) from each sentence.

    Returns (fixed_sentences, indices_to_remove): indices_to_remove lists
    positions of sentences that became empty after stripping; those
    sentences are still kept in fixed_sentences as empty strings.
    """
    fixed_sentences = []
    indices_to_remove = []
    for i, sentence in enumerate(sentences):
        kept_tokens = [t for t in sentence.split() if not ENTITY_REGEX.match(t)]
        if not kept_tokens:
            indices_to_remove.append(i)
        # else:  (kept sentences and emptied ones are both appended)
        fixed_sentences.append(" ".join(kept_tokens))
    return fixed_sentences, indices_to_remove


# Lazily-initialized spaCy pipeline (loading the English model is expensive).
nlp_parser = None


def to_nlp_objs(sentences):
    """Parse each sentence with spaCy; returns a list of parsed documents."""
    global nlp_parser
    # init once
    if nlp_parser is None:
        nlp_parser = English()
    # NOTE(review): .decode('unicode-escape') assumes Python 2 byte strings;
    # under Python 3 str has no .decode -- confirm before porting.
    return [nlp_parser(s.decode('unicode-escape'), entity=False) for s in sentences]


def get_nlp_features(sentences):
    """Return, for each sentence, the list of POS tags of its tokens."""
    parsed = to_nlp_objs(sentences)
    return [[token.pos_ for token in doc] for doc in parsed]


def to_pos_bow(train_samples, test_samples, ngram_range=DEFAULT_BOW_NGRAM_RANGE,
               binary=DEFAULT_BOW_BINARY):
    """Bag-of-words over POS-tag sequences instead of the raw tokens.

    Returns (train_features, test_features, vectorizer).
    """
    # TODO: can do this more efficiently, this is a workaround for now
    pos_tags_train = [" ".join(tags) for tags in get_nlp_features(train_samples)]
    pos_tags_test = [" ".join(tags) for tags in get_nlp_features(test_samples)]
    return to_bag_of_words(pos_tags_train, pos_tags_test,
                           ngram_range=ngram_range, binary=binary,
                           max_features=None)


def to_bag_of_words(train_samples, test_samples,
                    ngram_range=DEFAULT_BOW_NGRAM_RANGE,
                    max_features=DEFAULT_BOW_MAX_FEATURES,
                    binary=DEFAULT_BOW_BINARY):
    """Fit a CountVectorizer on the train samples and transform both sets.

    Returns (train_features, test_features, vectorizer); the feature
    matrices are scipy sparse matrices.
    """
    # Initialize the "CountVectorizer" object, scikit-learn's bag-of-words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=max_features,
                                 binary=binary,
                                 ngram_range=ngram_range)
    train_data_features = vectorizer.fit_transform(train_samples)
    test_data_features = vectorizer.transform(test_samples)
    return train_data_features, test_data_features, vectorizer


def get_bow_features(train_samples, test_samples, ngram_range):
    """Word-level bag-of-words; returns (train, test, vectorizer)."""
    return to_bag_of_words(train_samples, test_samples, ngram_range=ngram_range)


def get_bow_and_pos_features(train_samples, test_samples, ngram_range, pos_ngram_range):
    """Concatenate word-BOW and POS-BOW features via sparse hstack.

    Returns (train_features, test_features).
    """
    # BUG FIX: get_bow_features and to_pos_bow both return 3-tuples
    # (train, test, vectorizer); the original unpacked them into two names,
    # raising "too many values to unpack" at runtime.
    bow_train_features, bow_test_features, _ = get_bow_features(
        train_samples, test_samples, ngram_range)
    pos_train_features, pos_test_features, _ = to_pos_bow(
        train_samples, test_samples, ngram_range=pos_ngram_range)
    train_features = hstack((bow_train_features, pos_train_features))
    test_features = hstack((bow_test_features, pos_test_features))
    return train_features, test_features


def get_compound_features(train_data, test_data, feature_gen_methods):
    """Run each feature generator and hstack all the results together.

    Each method in feature_gen_methods must return a
    (train_features, test_features) pair.
    """
    train_features_list = []
    test_features_list = []
    for method in feature_gen_methods:
        train_features, test_features = method(train_data, test_data)
        train_features_list.append(train_features)
        test_features_list.append(test_features)

    train_features = train_features_list[0]
    test_features = test_features_list[0]
    for i in range(1, len(feature_gen_methods)):
        train_features = hstack((train_features, train_features_list[i]))
        test_features = hstack((test_features, test_features_list[i]))
    return train_features, test_features


def merge_into_file(input_path_or_data, output):
    """Append lines to the already-open *output* file handle.

    input_path_or_data may be None (no-op, returns None), a sequence of
    lines (written with newlines appended), or a path to a file whose
    lines are copied verbatim. Returns the number of lines written.
    """
    if input_path_or_data is None:
        return
    # if it's data (an in-memory sequence) and not a path
    if dpt.is_sequence(input_path_or_data):
        for line in input_path_or_data:
            output.write(line + "\n")
        return len(input_path_or_data)
    count = 0
    with open(input_path_or_data) as input_handle:
        for line in input_handle:
            output.write(line)
            count += 1
    return count


def build_doc2vec_model(data, temp_doc2vec_input_file_path,
                        background_samples_file_path=None,
                        model_file_path=None, should_log=False):
    """Train a Doc2Vec model over *data* plus an optional background corpus.

    The sentences are merged into one temp file so gensim's
    TaggedLineDocument can stream them; this can be further optimized by a
    custom doc2vec iterator that reads the sources in sequence. If
    model_file_path is given, the model is also saved there.
    Returns the trained model.
    """
    if should_log:
        # Reload logging so basicConfig takes effect even if logging was
        # already configured elsewhere (basicConfig is otherwise a no-op).
        try:
            reload(logging)  # Python 2 builtin
        except NameError:
            import importlib
            importlib.reload(logging)  # Python 3
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        logger = logging.getLogger()

    print("creating temp file...")
    with open(temp_doc2vec_input_file_path, "w") as output:
        merge_into_file(data, output)
        merge_into_file(background_samples_file_path, output)

    with open(temp_doc2vec_input_file_path) as handle:
        print("creating model...")
        # TODO: add min_count = 5, but deal with empty sentences..
        ncpus = multiprocessing.cpu_count()
        model = Doc2Vec(TaggedLineDocument(handle), size=200, window=8,
                        min_count=5, workers=ncpus)
        print("model built")

    if model_file_path is not None:
        model.save(model_file_path)
    return model


# get the doc2vec feature vectors
def get_doc2vec_features(train_data, test_data, temp_doc2vec_input_file_path,
                         background_samples_file_path=None):
    """Train doc2vec on train+test sentences and return their embeddings.

    Document vector i corresponds to line i of the merged corpus, so the
    train vectors come first, followed by the test vectors.
    Returns (train_embeddings, test_embeddings, model).
    """
    input_data = train_data + test_data
    model = build_doc2vec_model(input_data, temp_doc2vec_input_file_path,
                                background_samples_file_path, should_log=True)

    # extract the vectors according to their position in the merged corpus
    train_embeddings = [model.docvecs[i] for i in range(len(train_data))]
    test_embeddings = [model.docvecs[i]
                       for i in range(len(train_data),
                                      len(train_data) + len(test_data))]
    # background_embeddings = [model.docvecs[i] for i in
    #     range(len(train_data) + len(test_data), model.docvecs.count)]
    return train_embeddings, test_embeddings, model


def label_to_class(label, is_multiclass, auto_add_classes=False):
    """Map a textual label to its integer class id.

    In multiclass mode an unknown label raises KeyError unless
    auto_add_classes is True, in which case it is assigned the next free
    class id (mutating MULTICLASS_LABELS_TO_CLASSES_TABLE in place).
    """
    if is_multiclass:
        if label not in MULTICLASS_LABELS_TO_CLASSES_TABLE and auto_add_classes:
            max_class = max(MULTICLASS_LABELS_TO_CLASSES_TABLE.values())
            MULTICLASS_LABELS_TO_CLASSES_TABLE[label] = max_class + 1
        return MULTICLASS_LABELS_TO_CLASSES_TABLE[label]
    return BINARY_LABELS_TO_CLASSES_TABLE[label]


def labels_to_classes(labels, is_multiclass=False):
    """Map a list of textual labels to integer class ids."""
    return [label_to_class(label, is_multiclass) for label in labels]


def gen_features_and_classes(train_test_data, gen_features_func):
    """Apply a feature generator to a TrainTestData of raw sentences.

    gen_features_func must return (train_features, test_features, model).
    Returns a new TrainTestData holding features and integer classes.
    """
    train_classes = labels_to_classes(train_test_data.train_labels,
                                      is_multiclass=train_test_data.is_multiclass)
    test_classes = labels_to_classes(train_test_data.test_labels,
                                     is_multiclass=train_test_data.is_multiclass)
    train_features, test_features, model = gen_features_func(
        train_test_data.train_data, train_test_data.test_data)
    return TrainTestData(train_features, train_classes, test_features,
                         test_classes, train_test_data.is_multiclass, model)


def write_features_classes_to_file(file_path, data, labels):
    """Write "<label>,<f1>,<f2>,..." rows to *file_path*.

    Each row of *data* must be a sparse (1, n) matrix (has .toarray()).
    """
    with open(file_path, "w") as handle:
        for row, label in zip(data, labels):
            line_text = str(label) + "," + ",".join(str(x) for x in row.toarray()[0])
            handle.write(line_text + "\n")


# feature evaluation

def read_data_labels(file_path):
    """Read a CSV written by write_features_classes_to_file.

    Returns (data, labels): labels are floats taken from column 0, data is
    a list of float feature vectors from the remaining columns.
    """
    data = []
    labels = []
    with open(file_path) as handle:
        for line in handle:
            parts = line.rstrip().split(",")
            labels.append(float(parts[0]))
            data.append([float(v) for v in parts[1:]])
    return data, labels


def read_train_test_data(input_dir, name):
    """Load the "<name>_train.csv" / "<name>_test.csv" feature files."""
    train_file_path = join(input_dir, name + "_train.csv")
    test_file_path = join(input_dir, name + "_test.csv")
    train_data, train_labels = read_data_labels(train_file_path)
    test_data, test_labels = read_data_labels(test_file_path)
    is_multiclass = name.find("multiclass") > -1
    return TrainTestData(train_data, train_labels, test_data, test_labels,
                         is_multiclass)


from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV


def evaluate_model(train_test_data, model_initializer):
    """Fit a fresh classifier, print its test report, return the result.

    model_initializer is a zero-argument callable returning an unfitted
    estimator that supports fit / predict / predict_proba.
    """
    clf = model_initializer()
    clf = clf.fit(train_test_data.train_data, train_test_data.train_labels)
    labels_predicted = clf.predict(train_test_data.test_data)
    scores_predicted = clf.predict_proba(train_test_data.test_data)
    print(metrics.classification_report(train_test_data.test_labels,
                                        labels_predicted))
    return EvaluationResult(clf, train_test_data.feature_gen_model,
                            train_test_data.test_data, scores_predicted)


class GenFeaturesMethod:
    """Named feature-generation method.

    no_entities selects the entity-stripped variant of the dataset.
    """

    def __init__(self, name, func, no_entities=False):
        self.name = name
        self.func = func
        self.no_entities = no_entities


class EvaluationMethod:
    """Named model initializer used by run_gen_features_pipeline."""

    def __init__(self, name, func):
        self.name = name
        self.func = func


# get path to the data input dir, and a list of GenFeaturesMethod objects
def run_gen_features_pipeline(input_dir, gen_features_methods, evaluation_methods):
    """Full pipeline over every dataset under *input_dir*.

    For each dataset, generates features with each GenFeaturesMethod and
    evaluates every EvaluationMethod on them.
    Returns a flat list of EvaluationResult objects.
    """
    data_dict, data_dict_no_entities = read_train_and_test_data_from_path(input_dir)
    results = []
    for name in data_dict:
        for gfm in gen_features_methods:
            print("generating %s features for %s" % (gfm.name, name))
            if gfm.no_entities:
                data = data_dict_no_entities[name]
            else:
                data = data_dict[name]
            train_test_data = gen_features_and_classes(data, gfm.func)
            for em in evaluation_methods:
                print("model evaluation for: %s, %s, %s" % (name, gfm.name, em.name))
                result = evaluate_model(train_test_data, em.func)
                results.append(result)
    return results