import codecs import re import imp from alchemyapi import AlchemyAPI import json import xml.sax import nltk import ast from nltk.corpus import stopwords from collections import Counter import nltk.data from nltk.tokenize import word_tokenize import math stops = stopwords.words("english") sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') ############################################ # Add Full Stops if not?,!,'," # ############################################ def add_full_stops_to_the_end(infile, outfile): #clean data of small titles nad add full stops for NLTK to work output_format = '{}.\n'.format with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout: for line in fin: if line[0] == ' ': pass #ignore headlines with less than three words elif len(line.split()) <= 3: pass elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('!\n') or line.endswith('\'\n') or line.endswith('"\n'): print >> fout, line.decode('utf-8'), else: print >> fout, output_format(line.strip()).decode('utf-8'), ############################################ # Convert All except first word and quotes # to lower case # ############################################ def convert_to_lower_case(infile, outfile): f = open(infile) f2 = codecs.open(outfile, "w+", "utf-8") for l in f: #In quotes text are Ok if len(re.findall('\"(.+?)\"', l)): start = l.find('"') + 1 end = l.find('"', start) temp = len(l.split(' ', 1)[0]) + 1 if start == 1: k = '"' + l[start:end].decode('utf-8') + l[end:].decode('utf-8').lower() else: k = l.split(' ', 1)[0].decode('utf-8') + " " + (l.split(' ', 1)[1]).decode('utf-8')[:start-temp].lower() + l[start:end].decode('utf-8') + l[end:].decode('utf-8').lower() print >> f2, k, else: print >> f2, l.split(' ', 1)[0].decode('utf-8') + " " + l.split(' ', 1)[1].decode('utf-8').lower(), ############################################ # Perform entity recognition to re-cap # proper nouns for better POS tagging ############################################ def convert_to_clean_titles(infile, outfile): alchemyapi = AlchemyAPI() f = open(infile, "r") f2 = codecs.open(outfile, "w+", "utf-8") f3 = codecs.open("Entities.txt", "w+", "utf-8") count = 1 for line in f: line = line.decode("utf-8") response = alchemyapi.entities('text', line, {'sentiment': 1, 'disambiguate' : 1}) if response['status'] == 'OK': for entity in response['entities']: if "type" in entity.keys: if entity['type'] in ['Country', 'Holiday', 'Movie', 'MusicGroup', 'Organization', 'Person', 'PrintMedia', 'Region', 'StateOrCountry', 'TelevisionShow', 'TelevisionStation', 'Money', 'Company', 'GeographicFeature']: line = line.replace(entity['text'], entity['text'].title()) print >> f3, entity['text'], entity['type'], entity['sentiment'] print >> f2, line, else: print >> f2, line, print count, line count += 1 def handle_indian_actors(infile, outfile): f = open(infile, "r") f2 = codecs.open(outfile, "w+", "utf-8") f3 = open("actors_final.txt", "r") actors = f3.readline().split(',') for line in f: line = line.decode("utf-8") words = line.split() for w in words: if w.title() in actors: line = line.replace(w, w.title()) print >> f2, line, def handle_multiple_sentences(infile, outfile): titles = [] f = open(infile, "r") f2 = codecs.open(outfile, "w+", "utf-8") for line in f: line = line.decode("utf-8") sentences = sent_detector.tokenize(line.strip()) for i in range(len(sentences)): if i == 0: sentences[i] = sentences[i].replace(sentences[i].split()[0],sentences[i].split()[0].title()) else: sentences[i] = sentences[i].replace(sentences[i].split()[0],sentences[i].split()[0].title()) sentences[i-1] = sentences[i-1].replace(sentences[i-1].split()[-1][-1], " ::::") titles.append(" ".join(sentences)) title_set = set(titles) for l in title_set: print >> f2, l def normalise_numbers_quotes(infile, outfile): pass #done in file class HeadlineContentHandler(xml.sax.ContentHandler): def __init__(self, outfile): xml.sax.ContentHandler.__init__(self) self.outfile = outfile self.currentData = "" self.parse = "" self.trip = 0 self.type = "" self._charBuffer = [] self.dict = {} self.dict_token = {} self.depidx = -1 self.govidx = -1 self.governor = () self.dependent = () def _flushCharBuffer(self): s = ''.join(self._charBuffer) self._charBuffer = [] return s def startElement(self, tag, attrs): self.currentData = tag if tag == "sentence": self.dict["tokens"] = [] self.dict["id"] = int(attrs.getValue("id")) self.basic = [] self.cc_processed = [] self.collapsed = [] elif tag == "token": print attrs.getValue("id") self.dict_token["id"] = int(attrs.getValue("id")) elif tag == "dependencies" and attrs.getValue("type") == "basic-dependencies": self.trip = 1 elif tag == "dependencies" and attrs.getValue("type") == "collapsed-dependencies": self.trip = 2 elif tag == "dependencies" and attrs.getValue("type") == "collapsed-ccprocessed-dependencies": self.trip = 3 elif tag == "dep": self.type = attrs.getValue("type") elif tag == "dependent": self.depidx = attrs.getValue("idx") elif tag == "governor": self.govidx = attrs.getValue("idx") def endElement(self, tag): if tag == "sentence": self.dict["basic-dependencies"] = self.basic self.dict["collapsed-dependencies"] = self.collapsed self.dict["collapsed-ccprocessed-dependencies"] = self.cc_processed print >> self.outfile, self.dict self.dict = {} elif tag == "token": self.dict["tokens"].append(self.dict_token) self.dict_token = {} self.dict["subject"] = [] elif tag == "parse": self.dict["parse"] = self._flushCharBuffer().strip() elif tag == "word": self.dict_token["word"] = self._flushCharBuffer().strip() elif tag == "lemma": self.dict_token["lemma"] = self._flushCharBuffer().strip() elif tag == "POS": self.dict_token["pos"] = self._flushCharBuffer().strip() elif tag == "NER": self.dict_token["ser"] = self._flushCharBuffer().strip() elif tag == "sentiment": self.dict_token["sentiment"] = self._flushCharBuffer().strip() elif tag == "governor": self.governor = (self.govidx, self._flushCharBuffer().strip()) elif tag == "dependent": self.dependent = (self.depidx, self._flushCharBuffer().strip()) elif tag == "dep": #create type and gov, dep tuple list list = [self.type, self.governor, self.dependent] if self.trip == 1: self.basic.append(list) elif self.trip == 2: self.collapsed.append(list) elif self.trip == 3: self.cc_processed.append(list) def characters(self, content): if self.currentData == "parse": self._charBuffer.append(content) elif self.currentData == "word": self._charBuffer.append(content) elif self.currentData == "lemma": self._charBuffer.append(content) elif self.currentData == "POS": self._charBuffer.append(content) elif self.currentData == "NER": self._charBuffer.append(content) elif self.currentData == "sentiment": self._charBuffer.append(content) elif self.currentData == "dependent": self._charBuffer.append(content) elif self.currentData == "governor": self._charBuffer.append(content) def stanford_nlp_processor(infile, outfile): f2 = codecs.open(outfile, "a+", "utf-8") parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 0) Handler = HeadlineContentHandler(f2) parser.setContentHandler(Handler) parser.parse(infile) def join_multiple_sentences(infile, outfile): f = open(infile, "r") f2 = codecs.open(outfile, "w+", "utf-8") lines = f.readlines() i = 0 while i < len(lines): d = dict() pos = [] sen = [] sentence = ast.literal_eval(lines[i]) if (i < len(lines)-1 ): next_sentence = ast.literal_eval(lines[i+1]) words_ = next_sentence["tokens"] if words_[0]['pos'] == ':': #combine the dictionaries of sentence and next_sentence d['tokens'] = sentence['tokens'] + next_sentence['tokens'] d['subject'] = sentence['subject'] + next_sentence['subject'] d['parse'] = sentence['parse'] + next_sentence['parse'] d["basic-dependencies"] = sentence['basic-dependencies'] + next_sentence['basic-dependencies'] d["collapsed-dependencies"] = sentence['collapsed-dependencies'] + next_sentence['collapsed-dependencies'] d["collapsed-ccprocessed-dependencies"] = sentence['collapsed-ccprocessed-dependencies'] + next_sentence['collapsed-ccprocessed-dependencies'] print >> f2, d i += 1 else: #print the dictionaries as it is print >> f2, sentence i += 1 def n_gram_analysis_simple(infile, gram, stop): ngram = dict() f = open(infile, "r" ) #f2 = codecs.open(outfile, "w+", "utf-8") for l in f: x = nltk.ngrams(l.split(),gram) for w in x: # if stop: # if w not in stops: # if w in ngram: # ngram[w]+=1 # else: # ngram[w]=1 if w in ngram: ngram[w] += 1 else: ngram[w] = 1 p = list(ngram.items()) p.sort(key = lambda x: -x[1]) print len(p) for x in p[:10]: sen = ' '.join(x[0]) cnt = int(x[1]) if cnt == 0: cnt = 1 print sen, cnt def create_tag_sentence_dictionary(infile, outfile): tagged_data = open(infile, "r+") f2 = codecs.open(outfile, "a+", "utf-8") for line in tagged_data: sentence = ast.literal_eval(line) d = dict() pos = [] sen= [] for word in sentence["tokens"]: pos.append(word['pos']) for word in sentence["tokens"]: sen.append(word['word']) if len(sen) >= 4: d['sentence'] = " ".join(sen) d['pos_sentence'] = " ".join(pos) print >> f2, d def pos_with_common_words(infile, outfile): f = open("common.txt", "r") f1 = open(infile, "r+") f2 = codecs.open(outfile, "w+", "utf-8") common_words = set(f.readline().split()) for line in f1: sentence = ast.literal_eval(line) for word in sentence["tokens"]: #list of common words needs to be filtered for better clustering results if word['word'].lower() in common_words: print >> f2, word['word'], elif word['pos'] == 'CD': print >> f2, "<D>", else: print >>f2, word['pos'], print >> f2,'\n' def obtain_hyperbolic_terms(infile, outfile): tagged_data = open( infile, "r+") f2 = codecs.open(outfile, "a+" ,"utf-8") words = [] for line in tagged_data: sentence = ast.literal_eval(line) for word in sentence["tokens"]: #list of common words needs to be filtered for better clustering results try: if word['sentiment'] == u'Very negetive': # Positive and word['pos'] in ['JJ', 'JJS', 'JJR', 'RB', 'RBS', 'RBR']: words.append(word['word']) except: pass print "Very Positive Words\n" print >> f2, set(words) print "\n\n" def pos_with_entity_replaced_common_words(infile, outfile): alchemyapi = AlchemyAPI() common_word_pos = open("common_word_pos.txt", "r") title_data = open(infile, "r+") f2 = codecs.open(outfile, "w+", "utf-8") for line1, line2 in title_data, common_word_pos: response = alchemyapi.entities('text', line1, {'sentiment': 1, 'disambiguate' : 1}) if response['status'] == 'OK': for entity in response['entities']: line2.replace(entity['text'], entity['type']) print >> f2, line2, def subjects(infile, outfile): f1 = open(infile, "r+") f2 = codecs.open(outfile, "w+", "utf-8") subjects = dict() for l in f1: sentence = ast.literal_eval(l) dependency = sentence['collapsed-ccprocessed-dependencies'] for d in dependency: if d[0] == 'nsubj': sub = d[2][1] if sub in subjects: subjects[sub] += 1 else: subjects[sub] = 1 p = list(subjects.items()) p.sort(key = lambda x: -x[1]) for x in p[:50]: print >> f2, x def dependency_bigram_analysis(infile, outfile): f1 = open(infile, "r+") f2 = codecs.open(outfile, "w+", "utf-8") # f3 = open() relation = dict() example = dict() example_sentence = dict() cnt = 0 for l in f1: sentence = ast.literal_eval(l) words = sentence['tokens'] s = ' '.join([w['word'] for w in words]) pos_dict = dict() for w in words: pos_dict[w['id']] = w['pos'] dependency = sentence['basic-dependencies'] for d in dependency: dep_dict = dict() if math.fabs(int(d[1][0]) - int(d[2][0])) >= 3: r = dep_dict['relation'] = d[0] if r in relation: relation[r] += 1 else: relation[r] = 1 example[r] = [d[1][1], d[2][1]] example_sentence[r] = s cnt += 1 print cnt p = list(relation.items()) p.sort(key = lambda x: -x[1]) for x in p: print >> f2, x, example[x[0]], example_sentence[x[0]] def average_length_sentences(infile): f1 = open(infile, "r+") # f2 = codecs.open(outfile, "w+", "utf-8") cnt = 0 lines = f1.readlines() for line in lines: cnt += len(word_tokenize(line.decode("utf-8"))) avg = float(cnt/len(lines)) print cnt, len(lines), avg def create_histrogram_given_dictionary(d, wonky, title): fig = plt.figure() if wonky: d_ = dict() for key , value in d.iteritems(): d_[key[0]] = value f = {x:70*i for i,x in enumerate(set(d_.keys()))} new_d = dict() for key, value in d_.iteritems(): new_d[f[key]] = value else: f = {x:70*i for i,x in enumerate(set(d.keys()))} new_d = dict() for key, value in d.iteritems(): new_d[f[key]] = value c = list(new_d.items()) X,Y = zip(*c) plt.barh(X,Y,align='center') c = list(f.items()) ticks,pos = zip(*c) pylab.yticks(pos,ticks) matplotlib.rc('ytick', labelsize=8) fig.suptitle(title) plt.show() def analyse_clickbait(): #preprocessing # add_full_stops_to_the_end("titles.txt", "full_stop_titles.txt") # convert_to_lower_case("full_stop_titles.txt", "lower_case_titles.txt") # convert_to_clean_titles("lower_case_titles.txt", "clickbaittitles.txt") # handle_indian_actors("clickbaittitles.txt", "clickbaittitles_indian.txt") # handle_multiple_sentences("clickbaittitles_indian.txt", "final_clickbait.txt") # stanford_nlp_processor("processing/cb7.txt.xml", "cb.out") # join_multiple_sentences("cb.out", "cb_final.out") # create_tag_sentence_dictionary("cb_final.out", "tag_word_dictionary.txt") # pos_with_common_words("/home/bhargavi/Desktop/BTP/news/news.out", "/home/bhargavi/Desktop/BTP/click_bait/classifiers/normalised_titles_.txt") # n_gram_analysis_simple("final_clickbait.txt", 4 , 0) # subjects("cb.out", "subjects.txt") # obtain_hyperbolic_terms("cb.out", "hyperbolic.txt") average_length_sentences("/home/bhargavi/Desktop/BTP/news/final_news.txt") # dependency_bigram_analysis("/home/bhargavi/Desktop/BTP/news/news.out", "/home/bhargavi/Desktop/BTP/news/dependency_news.txt") if __name__ == '__main__': analyse_clickbait()