""" Generates synthetic data from corpora consisting of individual sentences, such as the SICK corpus by replacing random words in each sentence with one of their synonyms found in WordNet. Implemented extension strategy owes to: [1] Mueller et al., "Siamese Recurrent Architectures for Learning Sentence Similarity." [2] Zhang et al., "Character-level convolutional networks for text classification." The extensions are, as expected, reasonably noisy. """ import os import numpy as np import pandas as pd import nltk from nltk import word_tokenize from nltk.corpus import wordnet from pywsd.lesk import simple_lesk, cosine_lesk, adapted_lesk import kenlm class SickExtender(object): """ Extends the SICK sentence similarity corpus with synthetic data generated by substituting synonyms for random content words. Synonyms are obtained via WordNet's synset.lemmas() lookup following the sense disambiguation of the word to be replaced which, in turn, relies on the specified Lesk algorithm - simple, cosine, or adapted. Refer to the pywsd documentation for further information. """ def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5, min_substitutions=2, num_candidates=5, concatenate_corpora=True): self.sick_path = sick_path self.target_directory = target_directory self.lm_path = lm_path self.wsd_algorithm = wsd_algorithm self.sampling_parameter = sampling_parameter self.min_substitutions = min_substitutions self.num_candidates = num_candidates self.concatenate_corpora = concatenate_corpora self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt') self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt') # Filter the original SICK corpus to match the expected format, and create file for LM training if not os.path.exists(self.filtered_path) or not os.path.exists(self.noscore_path): self.filter_sick() if self.lm_path is None: raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.') else: self.language_model = kenlm.LanguageModel(self.lm_path) def create_extension(self): """ Replaces some words within each line of the given file with their WordNet synonyms. Replacement limited to noun, verb, adj, and adv, as those are the POS tags utilized by WordNet.""" # Track the proportion of the corpus already processed counter = 0 # Create path to the SICK extension corpus if self.concatenate_corpora: target_path = os.path.join(self.target_directory, 'extended_sick.txt') else: target_path = os.path.join(self.target_directory, 'sick_extension.txt') # Generate paraphrases via thesaurus-based replacement print('Commencing with the creation of the synthetic SICK examples.') with open(self.filtered_path, 'r') as rf: with open(target_path, 'w') as wf: for line in rf: # Get tokens and POS tags, i.e. 
    def create_extension(self):
        """ Replaces some words within each line of the given file with their WordNet synonyms.
        Replacement is limited to nouns, verbs, adjectives, and adverbs, as those are the POS tags
        handled by WordNet. """
        # Track the number of sentence pairs processed so far
        counter = 0
        # Create the path to the SICK extension corpus
        if self.concatenate_corpora:
            target_path = os.path.join(self.target_directory, 'extended_sick.txt')
        else:
            target_path = os.path.join(self.target_directory, 'sick_extension.txt')

        # Generate paraphrases via thesaurus-based replacement
        print('Commencing with the creation of the synthetic SICK examples.')
        with open(self.filtered_path, 'r') as rf:
            with open(target_path, 'w') as wf:
                for line in rf:
                    # Get tokens and POS tags, i.e. sentences == [sent1, sent2]
                    sentences, sim_score = self.line_prep(line)
                    new_line = list()
                    for sentence in sentences:
                        # Store tokens for subsequent reconstruction
                        tokens = sentence[1]
                        # Get the most likely synset for each token
                        disambiguation = self.disambiguate_synset(sentence)
                        # Replace random words with random synonyms
                        candidate_list = self.replace_with_synonyms(disambiguation)
                        if candidate_list is None:
                            continue
                        paraphrase = self.pick_candidate(tokens, candidate_list)
                        new_line.append(paraphrase)
                    # If nothing could be replaced in either sentence, skip the sentence pair
                    if len(new_line) < 2:
                        continue
                    # Add header
                    # wf.write('sentence_A\tsentence_B\trelatedness_score')
                    if self.concatenate_corpora:
                        wf.write(line)
                        wf.write(new_line[0] + '\t' + new_line[1] + '\t' + sim_score)
                    else:
                        wf.write(new_line[0] + '\t' + new_line[1] + '\t' + sim_score)
                    # Basic bookkeeping
                    counter += 1
                    if counter % 50 == 0 and counter != 0:
                        print('Current progress: Line %d.' % counter)
                    # For quick testing
                    # if counter % 50 == 0 and counter != 0:
                    #     break

        print('The extension sentences for the SICK corpus have been successfully generated.\n'
              'They can be found under %s.\n'
              'Total number of new sentence pairs: %d.' % (target_path, counter))

    def filter_sick(self):
        """ Processes the original SICK corpus into a format where each line contains the two compared
        sentences followed by their relatedness score. """
        # Filter the SICK dataset for the sentences and the relatedness score only
        df_origin = pd.read_table(self.sick_path)
        df_classify = df_origin.loc[:, ['sentence_A', 'sentence_B', 'relatedness_score']]
        # Scale the relatedness score to lie in [0, 1] for the training of the classifier
        df_classify['relatedness_score'] = df_classify['relatedness_score'].apply(
            lambda x: "{:.4f}".format(float(x) / 5.0))
        df_noscore = df_origin.loc[:, ['sentence_A', 'sentence_B']]
        df_noscore = df_noscore.stack()

        # Write the filtered set to a .csv file
        df_classify.to_csv(self.filtered_path, sep='\t', index=False, header=False)
        print('Filtered corpus saved to %s.' % self.filtered_path)
        # Write a score-free set to a .csv file to be used in the training of the KN language model
        df_noscore.to_csv(self.noscore_path, index=False, header=False)
        print('Filtered corpus saved to %s.' % self.noscore_path)

    def line_prep(self, line):
        """ Tokenizes and POS-tags a line from the SICK corpus to make it compatible with the WordNet
        synset lookup. """
        # Split the line into the two sentences and their similarity score
        s1, s2, sim_score = line.split('\t')
        # Tokenize
        s1_tokens = word_tokenize(s1)
        s2_tokens = word_tokenize(s2)
        # Assign part-of-speech tags
        s1_penn_pos = nltk.pos_tag(s1_tokens)
        s2_penn_pos = nltk.pos_tag(s2_tokens)
        # Convert to WordNet POS tags and store the word's position in the sentence for replacement;
        # each tuple contains (word, WordNet_POS_tag, position)
        s1_wn_pos = list()
        s2_wn_pos = list()
        for idx, item in enumerate(s1_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))
        for idx, item in enumerate(s2_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))

        # The source sentence tokens are carried along for use in the disambiguation step
        return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score
""" # Select the disambiguation algorithm if self.wsd_algorithm == 'simple': wsd_function = simple_lesk elif self.wsd_algorithm == 'cosine': wsd_function = cosine_lesk elif self.wsd_algorithm == 'adapted': wsd_function = adapted_lesk else: raise ValueError('Please specify the word sense disambiguation algorithm:\n ' '\'simple\' for Simple Lesk\n' '\'cosine\' for Cosine Lesk\n' '\'adapted\' for Adapted/Extended Lesk') lemmas, context = sentence_plus_lemmas context = ' '.join(context) disambiguated = list() for lemma in lemmas: try: selection = wsd_function(context, lemma[0], pos=lemma[1]) # For simple_lesk disambiguation algorithm, in case no synsets can be found except IndexError: selection = None disambiguated.append((lemma[0], selection, lemma[2])) return disambiguated def replace_with_synonyms(self, disambiguated_lemmas): """ Calculates the distance between a lemma and each of its synonyms and orders them in a list by increasing distance. Uses the """ all_synonyms = list() # Obtain WordNet synonyms for each lemma in the sentence list for idx, lemma in enumerate(disambiguated_lemmas): if lemma[1] is not None: if len(lemma[1].lemma_names()) > 1: synonyms_per_word = ([' '.join(s.split('_')) for s in lemma[1].lemma_names()], idx) all_synonyms.append(synonyms_per_word) # If the sentence cannot be modified, skip it if len(all_synonyms) == 0: return None # Model a geometric distribution with parameter p, following Zhang, Zhao, and LeCun (2015) lower_bound = max(min(self.min_substitutions, len(all_synonyms)), 1) distribution = {i: self.sampling_parameter ** i for i in range(lower_bound, len(all_synonyms) + 1)} sampling_array = list() position = 0 for key in distribution.keys(): occurrences = int(np.round(distribution[key] * 1000)) while occurrences != 0: sampling_array.append(key) position += 1 occurrences -= 1 # Sample n substitutions outputs = list() no_subs = [(l[0], l[2]) for l in disambiguated_lemmas] for _ in range(self.num_candidates): syn_list = all_synonyms[:] candidate = no_subs[:] # Randomly pick the amount of word to replace with synonyms pick = np.random.randint(0, len(sampling_array)) to_replace = sampling_array[pick] # Perform replacement for __ in range(to_replace): # Randomly pick the word to be replaced j = np.random.randint(0, len(syn_list)) # Randomly pick the synonym to replace the word with k = np.random.randint(0, len(syn_list[j][0])) candidate[syn_list[j][1]] = (syn_list[j][0][k], disambiguated_lemmas[syn_list[j][1]][2]) # Remove the sampled synonym set del(syn_list[j]) outputs.append(candidate) return outputs def pick_candidate(self, tokens, candidate_list): """ Picks the most probable paraprase candidate according to the provided language model. """ best_paraphrase = None best_nll = 0 # Reconstruct and rate paraphrases for candidate in candidate_list: for replacement in candidate: tokens[replacement[1]] = replacement[0] paraphrase = ' '.join(tokens) score = self.language_model.score(paraphrase) # Keep the most probable one if abs(score) > best_nll: best_nll = score best_paraphrase = paraphrase return best_paraphrase @staticmethod def get_wordnet_pos(treebank_tag): """ Converts a Penn Tree-Bank part of speech tag into a corresponding WordNet-friendly tag. Borrowed from: http://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python. 
""" if treebank_tag.startswith('J') or treebank_tag.startswith('A'): return wordnet.ADJ elif treebank_tag.startswith('V'): return wordnet.VERB elif treebank_tag.startswith('N'): return wordnet.NOUN elif treebank_tag.startswith('R'): return wordnet.ADV else: return 'OTHER'