from nltk.corpus import wordnet as wordnet from nltk.corpus import sentiwordnet as swn import nltk def wordnet_pos_code(tag): '''Translation from nltk tags to Wordnet code''' if tag.startswith('NN'): return wordnet.NOUN elif tag.startswith('VB'): return wordnet.VERB elif tag.startswith('JJ'): return wordnet.ADJ elif tag.startswith('RB'): return wordnet.ADV else: return '' def pos_tag(sentence): '''POS tagging of a sentence.''' tagged_words = [] tokens = nltk.word_tokenize(sentence) tag_tuples = nltk.pos_tag(tokens) for (string, tag) in tag_tuples: token = {'word':string, 'pos':tag} tagged_words.append(token) return tagged_words def word_sense_cdf(word, context, wn_pos): '''Word sense disambiguation in terms of matching words frequency between the context each sense's definition. Adapted from www.slideshare.net/faigg/tutotial-of-sentiment-analysis''' senses = wordnet.synsets(word, wn_pos) if len(senses) > 0: cfd = nltk.ConditionalFreqDist((sense, def_word) for sense in senses for def_word in sense.definition().split() if def_word in context) best_sense = senses[0] for sense in senses: try: if cfd[sense].max() > cfd[best_sense].max(): best_sense = sense except: pass return best_sense else: return None def word_sense_similarity(word, context, dummy = None): '''Another word sense disambiguation technique. It's VERY SLOW. Adapted from: pythonhosted.org/sentiment_classifier''' wordsynsets = wordnet.synsets(word) bestScore = 0.0 result = None for synset in wordsynsets: for w in nltk.word_tokenize(context): score = 0.0 for wsynset in wordnet.synsets(w): sim = wordnet.path_similarity(wsynset, synset) if(sim == None): continue else: score += sim if (score > bestScore): bestScore = score result = synset return result def sentiwordnet_classify(text): '''Breaks a multi sentence text to separate sentences. This improves context for the word sense disambiguation. Returns a class''' score_tot = 0 score_tot_thr = 0 class_tot = 0 class_tot_thr = 0 sentences = nltk.sent_tokenize(text) for sentence in sentences: (score, score_thr) = sentence_score(sentence) score_tot += score score_tot_thr += score_thr #Trust the thresholded value more when classifying if score_tot_thr != 0: clss = 'pos' if score_tot_thr > 0 else 'neg' elif score_tot != 0: clss = 'pos' if score_tot > 0 else 'neg' else: clss = None return clss def sentence_score(text, threshold = 0.75, wsd = word_sense_cdf): '''Classifies a phrase according to sentiment analysis based on WordNet and SentiWordNet. It also computes a thresholded score by ignoring strongly objective words.''' tagged_words = pos_tag(text) obj_score = 0 # object score pos_score=0 # positive score neg_score=0 #negative score pos_score_thr=0 neg_score_thr=0 for word in tagged_words: # print word if 'punct' not in word : sense = wsd(word['word'], text, wordnet_pos_code(word['pos'])) if sense is not None: sent = swn.senti_synset(sense.name()) if sent is not None and sent.obj_score() <> 1: obj_score = obj_score + float(sent.obj_score()) pos_score = pos_score + float(sent.pos_score()) neg_score = neg_score + float(sent.neg_score()) if sent.obj_score() < threshold: pos_score_thr = pos_score_thr + float(sent.pos_score()) neg_score_thr = neg_score_thr + float(sent.neg_score()) return (pos_score - neg_score, pos_score_thr - neg_score_thr)