# See the original pyTweetCleaner at https://github.com/kevalmorabia97/pyTweetCleaner
"""
REMOVE:     TWEETS THAT HAVE in_reply_to_status_id != null i.e. COMMENTS ON SOMEONE ELSE'S TWEETS
            TWEETS THAT HAVE lang != en i.e. NOT IN ENGLISH LANGUAGE
            DATA ABOUT DELETED TWEETS
            NON-ASCII CHARACTERS FROM text
            HYPERLINKS FROM text
            STOPWORDS FROM text
            #tags & @name mentions
            digits[0-9] FROM text

KEEP:       created_at
            id
            text IN LOWERCASE
            user_id
            user_followers_count
            retweet_count
            entities_hashtags, entities_user_mentions

retweeted_status REPLACES ACTUAL tweet BUT PRESERVES RETWEETER'S USER INFO
"""

import json
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class TweetCleaner:
    """Clean raw tweet text and filter/project line-delimited tweet JSON."""

    def __init__(self, remove_stop_words=True, remove_retweets=False):
        """
        Configure the cleaner.

        remove_stop_words: when True, NLTK's English stop words are dropped
                           from cleaned text; when False nothing is treated
                           as a stop word.
        remove_retweets:   when True, clean_tweets() skips every tweet that
                           carries a 'retweeted_status' field.
        """
        if remove_stop_words:
            self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = set()

        self.remove_retweets = remove_retweets

        # Translation table that deletes every character in
        # string.punctuation; applied to each token after tokenizing.
        self.punc_table = str.maketrans("", "", string.punctuation)

    def remove_non_ascii_chars(self, text):
        """
        return text with every non-ascii character (code point >= 128)
        replaced by a single space
        """
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

    def remove_hyperlinks(self, text):
        """
        return text after dropping every space-separated chunk that
        contains the substring 'http'
        """
        return ' '.join(w for w in text.split(' ') if 'http' not in w)

    def get_cleaned_text(self, text):
        """
        return the cleaned, lower-cased version of text

        Steps, in order: strip double/single quotes and turn '-' into a
        space, blank out non-ascii characters, drop hyperlinks, remove
        digit runs, remove #hashtags, drop words starting with '@',
        tokenize, strip punctuation from each token, then discard stop
        words and tokens of length <= 1.
        """
        cleaned_text = text.replace('\"', '').replace('\'', '').replace('-', ' ')

        cleaned_text = self.remove_non_ascii_chars(cleaned_text)
        cleaned_text = self.remove_hyperlinks(cleaned_text)

        # remove digits (done before hashtag removal, so a tag such as
        # '#a1b' has already become '#ab' when the hashtag pattern runs)
        cleaned_text = re.sub(r'[0-9]+', '', cleaned_text)

        # remove hashtags
        cleaned_text = re.sub(r'#[_a-zA-Z][_a-zA-Z0-9]*', '', cleaned_text)

        # remove @name
        cleaned_text = ' '.join(
            w for w in cleaned_text.split() if not w.startswith('@')
        )

        # tokenize, then remove punctuation from each token
        tokens = [w.translate(self.punc_table) for w in word_tokenize(cleaned_text)]

        # lower-case; drop stop words and tokens left with <= 1 character
        tokens = [
            w.lower() for w in tokens
            if w.lower() not in self.stop_words and len(w) > 1
        ]

        return ' '.join(tokens)

    def _clean_tweet(self, tweet):
        """
        return the reduced dict for one parsed tweet, or None when the
        tweet must be skipped (retweet while remove_retweets is set,
        deleted-tweet record, non-English, reply, or empty cleaned text)
        """
        if 'retweeted_status' in tweet:
            if self.remove_retweets:
                return None
            # replace retweet with original tweet but preserve user info
            # of the retweeter
            retweeter_user_info = tweet['user']
            tweet = tweet['retweeted_status']
            tweet['retweet_count'] = 0
            tweet['user'] = retweeter_user_info

        # records about deleted tweets carry no 'created_at'
        if 'created_at' not in tweet:
            return None

        # remove tweets in a non-English language; .get() so a record
        # without 'lang' is skipped instead of raising KeyError
        if tweet.get('lang') != 'en':
            return None

        # remove comments (replies) on any tweet; a missing key is
        # treated the same as null
        if (tweet.get('in_reply_to_status_id') is not None
                or tweet.get('in_reply_to_user_id') is not None):
            return None

        cleaned_text = self.get_cleaned_text(tweet['text'])
        if cleaned_text == '':
            return None

        return {
            'created_at': tweet['created_at'],
            'id': tweet['id'],
            'text': cleaned_text,
            'user': {
                'id': tweet['user']['id'],
                'followers_count': tweet['user']['followers_count'],
            },
            'retweet_count': tweet['retweet_count'],
            'entities': {
                'hashtags': [ht['text'] for ht in tweet['entities']['hashtags']],
                'user_mentions': [
                    self.remove_non_ascii_chars(um['name'])
                    for um in tweet['entities']['user_mentions']
                ],
            },
        }

    def clean_tweets(self, input_file, output_file='cleaned_tweets.json'):
        """
        input_file: name or path of input twitter json data
                    (one JSON object per line; blank lines are ignored)
        output_file: file name or path where cleaned twitter json data is
                     stored, one JSON object per line
                     (default='cleaned_tweets.json')
        """
        # 'with' guarantees both handles are closed (and output flushed)
        # even if a malformed line raises part-way through the file.
        # NOTE(review): utf-8 is assumed for the tweet dump — confirm if
        # inputs in another encoding must be supported.
        with open(input_file, 'r', encoding='utf-8') as in_file, \
                open(output_file, 'w', encoding='utf-8') as out_file:
            for line in in_file:
                # skip empty lines rather than feeding them to json.loads
                if not line.strip():
                    continue

                cleaned_tweet = self._clean_tweet(json.loads(line))
                if cleaned_tweet is None:
                    continue

                out_file.write(json.dumps(cleaned_tweet) + '\n')


if __name__ == '__main__':
    tc = TweetCleaner(remove_stop_words=True, remove_retweets=True)
    print(tc.get_cleaned_text('The movie The Twilight Saga: New Moon was the biggest winner in MTV Movie Awards 2010; it took 4 out of 10 "Best" Awards #IAmGreat'))
    print('\nTweetCleaning DONE...')