FILENAME = 'sequences_full.csv'
VOCAB_SIZE = None
UNK = 'UNK'

# Penn Treebank POS tags; the keys double as vocabulary entries
POS_TAGS = {
    'CC'  : '<CC>',  'CD'   : '<CD>',   'DT'  : '<DT>',  'EX'   : '<EX>',
    'FW'  : '<FW>',  'IN'   : '<IN>',   'JJ'  : '<JJ>',  'JJR'  : '<JJR>',
    'JJS' : '<JJS>', 'LS'   : '<LS>',   'MD'  : '<MD>',  'NN'   : '<NN>',
    'NNS' : '<NNS>', 'NNP'  : '<NNP>',  'NNPS': '<NNPS>','PDT'  : '<PDT>',
    'POS' : '<POS>', 'PRP'  : '<PRP>',  'PRP$': '<PRP$>','RB'   : '<RB>',
    'RBR' : '<RBR>', 'RBS'  : '<RBS>',  'RP'  : '<RP>',  'SYM'  : '<SYM>',
    'TO'  : '<TO>',  'UH'   : '<UH>',   'VB'  : '<VB>',  'VBD'  : '<VBD>',
    'VBG' : '<VBG>', 'VBN'  : '<VBN>',  'VBP' : '<VBP>', 'VBZ'  : '<VBZ>',
    'WDT' : '<WDT>', 'WP'   : '<WP>',   'WP$' : '<WP$>', 'WRB'  : '<WRB>'
}

# imports : in the order of usage
import itertools
import nltk
import pickle


'''
 read lines from file
  skip the header row and the trailing empty line

 return [list of lines]
'''
def read_lines(filename):
    # latin-1 keeps stray windows bytes as \x9x chars for fix_win_encode
    return fix_win_encode(open(filename, encoding='latin-1').read()).split('\n')[1:-1]

'''
 repair stray windows-1252 bytes and markup debris in the raw text
'''
def fix_win_encode(text):
    return text.replace('\x92', "'").replace('\x97', ' ')\
            .replace('\x91', '').replace('\x93', '')\
            .replace('_b_', '').replace('*', '')

'''
 split each row of form "query |respect| response"
  to [ query, response, respect ]
'''
def split_row(lines):
    q, r, respect = [], [], []
    for line in lines:
        line = line.split('|')
        q.append(split_and_tag(line[0]))
        r.append(split_and_tag(line[-1]))
        respect.append(int(line[1]))
    return q, r, respect

'''
 split sentences into words and tags with nltk
  replace foreign words and numbers with their POS tags, FW and CD
'''
def split_and_tag(line):
    wtags = nltk.pos_tag(nltk.word_tokenize(line.strip()))
    words = []
    for w, t in wtags:
        if t == 'CD' or t == 'FW':
            w = t
        words.append(w)
    return words

'''
 read list of tokenized sentences,
  create index to word, word to index dictionaries

 return tuple( idx2w, w2idx, freq_dist )
'''
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # drop words that occur only once
    vocab = [ item for item in vocab if item[1] > 1 ]
    # index2word : '_' is the padding placeholder at index 0;
    #  skip words already present as POS_TAGS keys (e.g. CD, FW)
    index2word = ['_'] + [UNK] + list(POS_TAGS.keys()) \
            + [ x[0] for x in vocab if x[0] not in POS_TAGS ]
    # word2index
    word2index = dict([ (w, i) for i, w in enumerate(index2word) ])
    return index2word, word2index, freq_dist

'''
 There will be no zero padding!
'''
def encode(q, r, w2idx):
    # num of rows
    data_len = len(q)
    idx_q, idx_r = [], []
    for i in range(data_len):
        idx_q.append(encode_seq(q[i], w2idx))
        idx_r.append(encode_seq(r[i], w2idx))
    return idx_q, idx_r

'''
 replace words with indices in a sequence
  fall back to the word's POS tag if the word is not in lookup,
  and to unknown if the tag is not in lookup either

 return [list of indices]
'''
def encode_seq(seq, lookup):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            tag = nltk.pos_tag([word])[-1][-1]
            if tag in lookup:
                indices.append(lookup[tag])
            else:
                indices.append(lookup[UNK])
    return indices
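'''
 inverse of encode_seq : map indices back to words
  (an added helper sketch, not part of the original pipeline;
  index 0 is the '_' placeholder, index 1 is UNK)
'''
def decode_seq(indices, idx2w):
    return [ idx2w[i] for i in indices ]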
'''
 read the dataset, index the vocabulary, encode the sequences
  and pickle the results to disk
'''
def process_data():
    print('\n>> Read lines from file')
    lines = read_lines(filename=FILENAME)

    # change to lower case
    lines = [ line.lower() for line in lines ]
    print('>> [read_lines] {} lines;\nexamples\n{}'.format(
        len(lines), lines[121:125]))

    # split row into query, response and respect
    q, r, respect = split_row(lines)
    print('\n>> [split_row] \n{} {} {}'.format(
        q[121:125], r[121:125], respect[121:125]))

    ##############
    # NL pipeline
    ##############
    # [1] Spell check
    # [2] POS tagging

    # indexing -> idx2w, w2idx
    print('\n >> Index words')
    idx2w, w2idx, freq_dist = index_(q + r, vocab_size=VOCAB_SIZE)

    idx_q, idx_r = encode(q, r, w2idx)
    data = {
        'q' : idx_q,
        'r' : idx_r,
        'respect' : respect
    }

    # let us now save the necessary dictionaries
    metadata = {
        'w2idx' : w2idx,
        'idx2w' : idx2w,
        'freq_dist' : freq_dist,
        'respect_size' : max(respect) + 1
    }

    # write to disk : data control dictionaries
    with open('metadata.pkl', 'wb') as f:
        pickle.dump(metadata, f)
    with open('data.pkl', 'wb') as f:
        pickle.dump(data, f)

'''
 read the pickled data and control dictionaries back from disk
'''
def load_data(PATH=''):
    # read data control dictionaries
    with open(PATH + 'metadata.pkl', 'rb') as f:
        metadata = pickle.load(f)
    with open(PATH + 'data.pkl', 'rb') as f:
        data = pickle.load(f)
    return data, metadata


if __name__ == '__main__':
    process_data()
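'''
 example usage (a minimal sketch, assuming process_data() has already
 written data.pkl and metadata.pkl to the working directory):

     data, metadata = load_data()
     idx2w = metadata['idx2w']
     # decode the first encoded query back to words
     print(' '.join(decode_seq(data['q'][0], idx2w)))
'''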