import re
import string

from nltk.tokenize import TweetTokenizer

from linguistic_style_transfer_model.config import global_config
from linguistic_style_transfer_model.utils import log_initializer

source_file_path = "data/sentiment140/train-data.csv"
text_file_path = "data/sentiment140/tweets.txt"
labels_file_path = "data/sentiment140/sentiment.txt"

logger = log_initializer.setup_custom_logger(global_config.logger_name, "INFO")
tknzr = TweetTokenizer(strip_handles=True)


def clean_word(word):
    # Drop URLs and any token containing non-printable characters
    word = "" if (len(word) > 3 and word[:4] == "http") else word
    word = "" if any(char not in string.printable for char in word) else word
    return word


def clean_sentence(s):
    tokens = tknzr.tokenize(s)
    s = " ".join(clean_word(x) for x in tokens)
    s = re.sub(r"\\n", " ", s)  # literal "\n" sequences embedded in the raw text
    s = re.sub(r"\'m", " am", s)  # expand common English contractions
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"n\'t", " not", s)
    s = re.sub(r"\'re", " are", s)
    s = re.sub(r"\'d", " would", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"\d+", "number", s)  # collapse digit runs into a placeholder token
    s = s.replace("\r", " ")
    s = s.replace("\n", " ")
    s = s.strip().lower()
    return s


with open(file=text_file_path, mode='w') as text_file, \
        open(file=labels_file_path, mode='w') as label_file, \
        open(file=source_file_path, mode='r', encoding='ISO-8859-1') as source_file:
    for line in source_file:
        try:
            split_line = line.split(",")
            # The tweet text is the last CSV field; [1:-2] strips the opening
            # quote and the closing quote plus trailing newline
            cleaned_sentence = clean_sentence(split_line[-1][1:-2])
            text_file.write(cleaned_sentence + "\n")
            # First field is the quoted polarity label: 0 is negative, any
            # non-zero value (4 in Sentiment140) is treated as positive
            if int(split_line[0][1:-1]):
                label_file.write("pos\n")
            else:
                label_file.write("neg\n")
        except Exception:
            logger.debug("Skipped: {}".format(line))
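
# A quick illustration of what clean_sentence produces; the sample tweet below
# is hypothetical, not drawn from the dataset. Handles and URLs are stripped,
# contractions are expanded, and digit runs become the token "number":
#
#     >>> clean_sentence("@user I've spent 2 hours on http://example.com")
#     'i have spent number hours on'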