import importlib.util
import os

from nltk import word_tokenize

from common.dataset.formatter import Formatter
from common.dataset.label_schema import LabelSchema
from retrieval.filter_uninformative import uninformative

# Character escapes applied to page titles by ``preprocess``. None of the
# replacement strings contains a character that is itself escaped, so a
# single ``str.translate`` pass yields the same result as replacing the
# characters one after another.
_TITLE_ESCAPES = str.maketrans({
    " ": "_",
    "(": "-LRB-",
    ")": "-RRB-",
    ":": "-COLON-",
})


def preprocess(p):
    """Escape a page title and drop anything from the first ``#`` onwards.

    Spaces become ``_``, ``(`` becomes ``-LRB-``, ``)`` becomes ``-RRB-``
    and ``:`` becomes ``-COLON-``.

    :param p: page title string
    :return: the escaped title, truncated before the first ``#`` if present
    """
    return p.translate(_TITLE_ESCAPES).split("#")[0]


def _load_module_from_path(filename):
    """Import the Python source file at ``filename`` as module ``filter_doc``.

    :param filename: path of the file to execute as a module
    :return: the loaded module object
    :raises ImportError: if no import spec/loader can be built for the path
    """
    spec = importlib.util.spec_from_file_location('filter_doc', filename)
    # spec_from_file_location returns None when it cannot pick a loader for
    # the path; fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            "Cannot load filter module from {!r}".format(filename))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


class FeverFormatter(Formatter):
    """Base formatter holding the index, tokenizer and optional page filter."""

    def __init__(self, index, label_schema, tokenizer=None, filtering=None):
        """Set up the formatter.

        :param index: stored on ``self.index``; not used by the code in
            this module
        :param label_schema: passed through to ``Formatter.__init__``
        :param tokenizer: callable applied to claim text; defaults to
            ``self.nltk_tokenizer`` when ``None``
        :param filtering: path of a Python file exposing a ``preprocess``
            attribute; ``None`` or an empty value disables filtering
        :raises ImportError: if ``filtering`` is given but cannot be loaded
        """
        super().__init__(label_schema)
        self.index = index
        self.tokenize = tokenizer if tokenizer is not None else self.nltk_tokenizer
        self.filtering = None

        # Any falsy value (None, "") leaves filtering switched off.
        if filtering:
            self.filtering = _load_module_from_path(filtering).preprocess

    def nltk_tokenizer(self, text):
        """Tokenize ``text`` with NLTK and re-join the tokens with spaces."""
        return " ".join(word_tokenize(text))


class FEVERGoldFormatter(FeverFormatter):
    """Formats a record into claim, (page, sentence) evidence and label."""

    def format_line(self, line):
        """Convert one record into a claim/evidence/label dict.

        Evidence is taken from the first key present, in this order:
        ``predicted_sentences`` (items 0 and 1 of each entry),
        ``predicted_pages`` (item 0 of each entry, paired with ``-1``),
        otherwise ``evidence`` (items 2 and 3 of every entry in every group).

        :param line: mapping with a ``claim`` key, one of the evidence keys
            above, and optionally ``label``
        :return: dict with keys ``claim``, ``evidence``, ``label`` and
            ``label_text``; ``None`` if a filter is configured and it
            returns ``None`` for any evidence page
        """
        annotation = line["label"] if "label" in line else None

        if 'predicted_sentences' in line:
            pages = [(ev[0], ev[1]) for ev in line["predicted_sentences"]]
        elif 'predicted_pages' in line:
            # No sentence information in this case; -1 fills the slot.
            pages = [(ev[0], -1) for ev in line["predicted_pages"]]
        else:
            pages = [(ev[2], ev[3])
                     for evidence_group in line["evidence"]
                     for ev in evidence_group]

        # A single page rejected by the filter discards the whole record.
        if self.filtering is not None and any(
                self.filtering({"id": page}) is None for page, _ in pages):
            return None

        claim = self.tokenize(line["claim"])
        # Records without a label keep both label fields as None.
        label_id = None
        if annotation is not None:
            label_id = self.label_schema.get_id(annotation)

        return {"claim": claim, "evidence": pages,
                "label": label_id, "label_text": annotation}


class FEVERPredictionsFormatter(FeverFormatter):
    """Formats records whose evidence is a list of predicted pages."""

    def format_line(self, line):
        """Convert one record, escaping each predicted page title.

        :param line: mapping with ``claim``, ``label`` and
            ``predicted_pages`` keys; ``verifiable`` is read when
            ``label`` is ``None``
        :return: dict with keys ``claim``, ``evidence``, ``label`` and
            ``label_text``
        """
        annotation = line["label"]
        if annotation is None:
            # Fall back to the "verifiable" field when the label is None.
            annotation = line["verifiable"]

        pages = [preprocess(ev[0]) for ev in line["predicted_pages"]]
        return {"claim": self.tokenize(line["claim"]), "evidence": pages,
                "label": self.label_schema.get_id(annotation),
                "label_text": annotation}


class FEVERPredictions2Formatter(FeverFormatter):
    """Like ``FEVERPredictionsFormatter`` but without title escaping and
    with a fallback to the ``evidence`` key."""

    def format_line(self, line):
        """Convert one record using predicted pages, else evidence, else none.

        :param line: mapping with ``claim`` and ``label`` keys, optionally
            ``predicted_pages`` or ``evidence``; ``verifiable`` is read
            when ``label`` is ``None``
        :return: dict with keys ``claim``, ``evidence``, ``label`` and
            ``label_text``
        """
        annotation = line["label"]
        if annotation is None:
            # Fall back to the "verifiable" field when the label is None.
            annotation = line["verifiable"]

        if 'predicted_pages' in line:
            pages = [ev[0] for ev in line["predicted_pages"]]
        elif 'evidence' in line:
            # NOTE(review): takes item 1 of each top-level evidence entry,
            # whereas FEVERGoldFormatter reads items 2/3 of nested groups.
            # Presumably a different input layout; confirm against callers.
            pages = [ev[1] for ev in line["evidence"]]
        else:
            pages = []

        return {"claim": self.tokenize(line["claim"]), "evidence": pages,
                "label": self.label_schema.get_id(annotation),
                "label_text": annotation}


class FEVERLabelSchema(LabelSchema):
    """Label schema with the labels supports / refutes / not enough info."""

    def __init__(self):
        super().__init__(["supports", "refutes", "not enough info"])