import numpy as np import scipy as sc from prettyprint import pp import os import re #index label in the dictionary idx_lbl = 'idx' dfreq_lbl = "docfreq" pattern = re.compile(r'([a-zA-Z]+|[0-9]+(\.[0-9]+)?)') def tokenizeDoc(doc_address, min_len = 0, remove_numerics=True): """ to tokenize a document file to alphabetic tokens use this function. Parameters ---------- doc_address: str path to the file that is going to be tokenized min_len: int minimum length of a token. Default value is zero, it should always be non-negative. remove_numerics: boolean whether to remove the numeric tokens or not Returns ------- tokens: list list of tokens from the input document according to the filtering criteria specified """ from string import punctuation, digits tokens = [] try: f = open(doc_address) raw = f.read().lower() text = pattern.sub(r' \1 ', raw.replace('\n', ' ')) text_translated = '' if remove_numerics: text_translated = text.translate(None, punctuation + digits) else: text_translated = text.translate(None, punctuation) tokens = [word for word in text_translated.split(' ') if (word and len(word) > min_len)] f.close() except: print "Error: %s couldn't be opened!", doc_address finally: return tokens def createDictionary(classes, tokens_pool): """ this method will create a dictionary out of the tokens_pool it has been provided. Parameters ---------- classes: list list of the names of the classes of documents tokens_pool: dictionary dictionary of tokens. Each value of the dictionary is an list of lists, each list belonging to a document in the corresponding class that has a list of tokens Returns ------- token_dict: dictionary *Note that the tokens in the dictionary are not sorted, since in the vector space model that we are going to use, all words are treated equal. We practically believe in justice. Words in dictionary are tired of all this injustice they have been forced to take for such a long time. Now is the time to rise and earn the justice that belongs to them. """ token_dict = {} idx = 0 #a unique index for words in dictionary for cl in classes: for tokens_list in tokens_pool[cl]: for token in tokens_list: if token in token_dict: #if token has been added to the dictionary before if cl in token_dict[token]: token_dict[token][cl] += 1 else: token_dict[token][cl] = 1 else: token_dict[token] = {} token_dict[token][idx_lbl] = idx idx += 1 token_dict[token][cl] = 1 return token_dict def createTokenPool(classes, paths): """ this method will create a pool of tokens out of the list of paths to documents it will be provided Parameters ---------- classes: list list of the names of the classes documents belong to paths: dictionary dictionary of lists of paths to documents Returns ------- token_pool: dictionary dictionary of lists of lists of tokens. each value bin of dictionary is a has a list of lists, for which each list is of a document and it contains a list of tokens in that document """ token_pool = {} for cl in classes: token_pool[cl] = [] for path in paths[cl]: token_pool[cl].append(tokenizeDoc(path)) return token_pool def saveDictToFile(tdict, filename): """ this method will save the key/value pair of the dictionary to a csv file Parameters ---------- tdict: dictionary dictionary object containing many pairs of key and value filename: str name of the dictionary file Returns ------- """ import csv w = csv.writer(open(filename, "w")) for key, val in tdict.items(): row = [] row.append(key) row.append(val[idx_lbl]) for cl in class_titles: if cl in val: row.append(cl + ':' + str(val[cl])) w.writerow(row) def readFileToDict(filename): """ this method will create a dictionary from a file *dictionary file is a csv file, each row contains a token and it's index Parameters ---------- filename: str name of the dictionary file Returns ------- tdict: dictionary dictionary object created from input file """ import csv, codecs tdict = {} for row in csv.reader(codecs.open(filename, 'r')): try: tdict[row[0]] = {} tdict[row[0]][idx_lbl] = int(row[1]) for i in range(2, len(row)): lbl, cnt = row[i].split(':') tdict[row[0]][lbl] = int(cnt) except: continue return tdict def train_test_split(ratio, classes, files): """ this method will split the input list of files to train and test sets. *Note: currently this method uses the simplest way an array can be split in two parts. Parameters ---------- ratio: float ratio of total documents in each class assigned to the training set classes: list list of label classes files: dictionary a dictionary with list of files for each class Returns ------- train_dic: dictionary a dictionary with lists of documents in the training set for each class test_dict: dictionary a dictionary with lists of documents in the testing set for each class """ train_dict = {} test_dict = {} for cl in classes: train_cnt = int(ratio * len(files[cl])) train_dict[cl] = files[cl][:train_cnt] test_dict[cl] = files[cl][train_cnt:] return train_dict, test_dict