# -*- coding: utf-8 -*-
"""
Created on Sat Nov 4 12:00:49 2017

@author: NishitP
"""

#import os
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import seaborn as sb

#before reading the files, set the working directory to point to the project repo

#reading data files
test_filename = 'test.csv'
train_filename = 'train.csv'
valid_filename = 'valid.csv'

train_news = pd.read_csv(train_filename)
test_news = pd.read_csv(test_filename)
valid_news = pd.read_csv(valid_filename)


#data observation
def data_obs():
    print("training dataset size:")
    print(train_news.shape)
    print(train_news.head(10))

    #the datasets below are used for testing and validation purposes
    print(test_news.shape)
    print(test_news.head(10))

    print(valid_news.shape)
    print(valid_news.head(10))

#check the data by calling the function below
#data_obs()


#distribution of classes for prediction
def create_distribution(dataFile):
    return sb.countplot(x='Label', data=dataFile, palette='hls')

#the plots below show that the training, test and validation data are fairly evenly distributed between the classes
create_distribution(train_news)
create_distribution(test_news)
create_distribution(valid_news)


#data integrity check (missing label values)
#none of the datasets contain missing values, therefore no cleaning is required
def data_qualityCheck():
    print("Checking data qualities...")
    train_news.isnull().sum()
    train_news.info()

    #the test and validation sets get the same checks
    test_news.isnull().sum()
    test_news.info()

    valid_news.isnull().sum()
    valid_news.info()
    print("check finished.")

#run the function call below to see the quality check results
#data_qualityCheck()


eng_stemmer = SnowballStemmer('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

#stemming: reduce each token to its stem with the given stemmer
def stem_tokens(tokens, stemmer):
    return [stemmer.stem(token) for token in tokens]

#process the data: lowercase the tokens, then optionally stem them and drop stopwords
def process_data(data, exclude_stopword=True, stem=True):
    tokens = [w.lower() for w in data]
    if stem:
        tokens = stem_tokens(tokens, eng_stemmer)
    if exclude_stopword:
        tokens = [w for w in tokens if w not in stopwords]
    return tokens


#creating ngrams
#unigram
def create_unigram(words):
    assert type(words) == list
    return words

#bigram
def create_bigrams(words):
    assert type(words) == list
    skip = 0            #number of tokens that may be skipped between the two words
    join_str = " "
    length = len(words)
    if length > 1:
        lst = []
        for i in range(length - 1):
            for k in range(1, skip + 2):
                if i + k < length:
                    lst.append(join_str.join([words[i], words[i + k]]))
    else:
        #fall back to unigrams when there is only one token
        lst = create_unigram(words)
    return lst

#trigram
def create_trigrams(words):
    assert type(words) == list
    skip = 0
    join_str = " "
    length = len(words)
    if length > 2:
        lst = []
        for i in range(length - 2):
            for k1 in range(1, skip + 2):
                for k2 in range(1, skip + 2):
                    if i + k1 < length and i + k1 + k2 < length:
                        lst.append(join_str.join([words[i], words[i + k1], words[i + k1 + k2]]))
    else:
        #fall back to bigrams when there are fewer than three tokens
        lst = create_bigrams(words)
    return lst


porter = PorterStemmer()

#plain whitespace tokenizer
def tokenizer(text):
    return text.split()

#whitespace tokenizer followed by Porter stemming
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

#doc = ['runners like running and thus they run','this is a test for tokens']
#tokenizer([word for line in test_news.iloc[:,1] for word in line.lower().split()])

#show the distribution of labels in the train and test data
"""def create_datafile(filename):
    #function to slice the dataframe, keeping only the variables needed for classification
    return "return df to be used"
"""
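#quick sanity check of the tokenizer and n-gram helpers (a minimal sketch; the
#sample sentence is illustrative only and is not part of any dataset)
sample_tokens = tokenizer_porter('runners like running and thus they run')
print(create_unigram(sample_tokens))   #stemmed unigrams, e.g. ['runner', 'like', 'run', ...]
print(create_bigrams(sample_tokens))   #adjacent pairs, e.g. ['runner like', 'like run', ...]
print(create_trigrams(sample_tokens))  #adjacent triples, e.g. ['runner like run', ...]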
"""
#converting multiclass labels present in our datasets to binary class labels
#note: data_TrainNews is not defined in this file, so the block stays disabled;
#the original also compared whole columns inside iterrows(), which is fixed here
true_labels = {"true", "mostly-true", "half-true"}
label_col = data_TrainNews.columns[0]
for i, row in data_TrainNews.iterrows():
    if row[label_col] in true_labels:
        data_TrainNews.loc[i, label_col] = "true"
    else:
        data_TrainNews.loc[i, label_col] = "false"

for i, row in data_TrainNews.iterrows():
    print(row)
"""
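#A vectorized alternative to the loop sketched above. It is kept commented out,
#like the loop it mirrors, so the labels are not altered on import; it assumes
#the label column is named 'Label' (as used by create_distribution above) and
#that train_news carries the six-way truthfulness labels.
#true_labels = {"true", "mostly-true", "half-true"}
#train_news['Label'] = train_news['Label'].isin(true_labels).map({True: "true", False: "false"})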