# -*- coding: utf-8 -*-
'''
Reads XML files containing FrameNet 1.$VERSION annotations, and converts them to a CoNLL 2009-like format.
'''
import codecs
import os.path
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import xml.etree.ElementTree as et
from optparse import OptionParser

from globalconfig import *
from xml_annotations import FrameAnnotation, SentenceAnnotation

optpr = OptionParser()
optpr.add_option("--filter_embeddings", action="store_true", default=False)
optpr.add_option("--exemplar", action="store_true", default=False)
(options, args) = optpr.parse_args()

# Plain-text log of every file / sentence / annotation visited and every issue found.
logger = open("preprocess-fn{}.log".format(VERSION), "w")

# Output CoNLL files.
trainf = TRAIN_EXEMPLAR
ftetrainf = TRAIN_FTE
devf = DEV_CONLL
testf = TEST_CONLL

# Output files holding the raw sentence text, one sentence per line.
trainsentf = TRAIN_EXEMPLAR + ".sents"
ftetrainsentf = TRAIN_FTE + ".sents"
devsentf = DEV_CONLL + ".sents"
testsentf = TEST_CONLL + ".sents"

relevantfelayers = ["Target", "FE"]
relevantposlayers = ["BNC", "PENN"]
ns = {'fn' : 'http://framenet.icsi.berkeley.edu'}

firsts = {trainf : True,
          devf : True,
          testf : True,
          ftetrainf : True}
# Number of examples written to each output CoNLL file.
sizes = {trainf : 0,
         devf : 0,
         testf : 0,
         ftetrainf : 0}

# Running statistics over the exemplar (LU) data; updated by process_lu_xml.
totsents = numsentsreused = fspno = numlus = 0.0
# True until the first exemplar example / exemplar sentence has been written,
# so that the corresponding output file is truncated exactly once.
isfirst = isfirstsent = True


def write_to_conll(outf, fsp, firstex, sentid):
    """
    Writes one frame-semantic parse to a CoNLL 2009-like file.

    :param outf: path of the output file
    :param fsp: annotation object providing ``sent.size()`` and ``info_at_idx(i)``
    :param firstex: if true the file is truncated ("w"), otherwise appended to ("a")
    :param sentid: 1-based sentence number; ``sentid - 1`` is written in the FEAT column
    """
    mode = "w" if firstex else "a"

    with codecs.open(outf, mode, "utf-8") as conll_file:
        for i in xrange(fsp.sent.size()):
            token, postag, nltkpostag, nltklemma, lu, frm, role = fsp.info_at_idx(i)

            conll_file.write(str(i+1) + "\t") # ID = 0
            conll_file.write(token.encode('utf-8') + "\t") # FORM = 1
            conll_file.write("_\t" + nltklemma + "\t") # LEMMA PLEMMA = 2,3
            conll_file.write(postag + "\t" + nltkpostag + "\t") # POS PPOS = 4,5
            conll_file.write(str(sentid-1) + "\t_\t") # FEAT PFEAT = 6,7 ~ replacing FEAT with sentence number
            conll_file.write("_\t_\t") # HEAD PHEAD = 8,9
            conll_file.write("_\t_\t") # DEPREL PDEPREL = 10,11
            conll_file.write(lu + "\t" + frm + "\t") # FILLPRED PRED = 12,13
            conll_file.write(role + "\n") #APREDS = 14

        conll_file.write("\n") # end of sentence


def write_to_sent_file(outsentf, sentence, isfirstsent):
    """
    Writes the raw sentence text followed by a newline to ``outsentf``.

    The file is truncated when ``isfirstsent`` is true and appended to otherwise.
    """
    mode = "w" if isfirstsent else "a"

    with codecs.open(outsentf, mode, "utf-8") as sent_file:
        sent_file.write(sentence + "\n") # end of sentence


def process_xml_labels(label, layertype):
    """
    Extracts the character span of an XML label element.

    :return: ``(start, end)`` as ints, or None (after logging the issue) when the
             label has no "start" or no "end" attribute
    """
    try:
        st = int(label.attrib["start"])
        en = int(label.attrib["end"])
    except KeyError:
        logger.write("\t\tIssue: start and/or end labels missing in " + layertype + "\n")
        return
    return (st, en)


def process_sent(sent, outsentf, isfirstsent):
    """
    Writes the sentence text to ``outsentf`` and builds its token/POS annotation
    from a BNC or PENN layer of the sentence.

    :return: the SentenceAnnotation, or None (after logging the reason) when token
             normalization fails or no POS information was found
    """
    senttext = ""
    for t in sent.findall('fn:text', ns): # not a real loop
        senttext = t.text

    write_to_sent_file(outsentf, senttext, isfirstsent)
    sentann = SentenceAnnotation(senttext)

    for anno in sent.findall('fn:annotationSet', ns):
        for layer in anno.findall('fn:layer', ns):
            layertype = layer.attrib["name"]
            if layertype in relevantposlayers:
                for label in layer.findall('fn:label', ns):
                    # NOTE(review): startend is None when the label lacks start/end;
                    # it is passed on unchecked - confirm add_token tolerates None.
                    startend = process_xml_labels(label, layertype)
                    sentann.add_token(startend)
                    sentann.add_postag(label.attrib["name"])
                if sentann.normalize_tokens(logger) is None:
                    logger.write("\t\tSkipping: incorrect tokenization\n")
                    return
                # only the first POS layer of an annotation set is used
                break
        if sentann.foundpos:
            break

    if not sentann.foundpos:
        # TODO do some manual tokenization
        logger.write("\t\tSkipping: missing POS tags and hence tokenization\n")
        return
    return sentann


def get_all_fsps_in_sent(sent, sentann, fspno, lex_unit, frame, isfulltextann, corpus):
    """
    Collects the frame-semantic parses (FSPs) annotated on one sentence.

    :param sent: XML sentence element
    :param sentann: SentenceAnnotation of the sentence
    :param fspno: running count of FSPs found so far
    :param lex_unit: lexical unit name; overridden per annotation set for full-text data
    :param frame: frame name; overridden per annotation set for full-text data
    :param isfulltextann: whether ``sent`` comes from a full-text annotation file
    :param corpus: string identifying the data being processed; UNANN annotation
                   sets are kept only when it contains "test"
    :return: ``(numannosets, fspno, fsps)`` - number of annotation sets counted,
             the updated FSP count, and a dict from annotation set ID to FrameAnnotation
    """
    numannosets = 0
    fsps = {}
    fspset = set([])

    # get all the FSP annotations for the sentence : it might have multiple targets and hence multiple FSPs
    for anno in sent.findall('fn:annotationSet', ns):
        anno_id = anno.attrib["ID"]
        if anno_id == "2019791" and VERSION == "1.5":
            # Hack to skip an erroneous annotation of Cathedral as raise.v with frame "Growing_food".
            continue
        numannosets += 1
        if numannosets == 1:
            # the first annotation set never yields an FSP
            # (presumably it only carries the POS layers - TODO confirm)
            continue
        if isfulltextann: # happens only for fulltext annotations
            if "luName" in anno.attrib:
                if anno.attrib["status"] == "UNANN" and "test" not in corpus: # keep the unannotated frame-elements only for test, to enable comparison
                    continue
                lex_unit = anno.attrib["luName"]
                frame = anno.attrib["frameName"]
                if frame == "Test35":
                    continue # bogus frame
            else:
                continue
            logger.write("\tannotation: " + str(anno_id) + "\t" + frame + "\t" + lex_unit + "\n")
        fsp = FrameAnnotation(lex_unit, frame, sentann)

        for layer in anno.findall('fn:layer', ns): # not a real loop
            layertype = layer.attrib["name"]
            if layertype not in relevantfelayers:
                continue
            if layertype == "Target":
                for label in layer.findall('fn:label', ns): # can be a real loop
                    startend = process_xml_labels(label, layertype)
                    if startend is None:
                        break
                    fsp.add_target(startend, logger)
            elif layertype == "FE" and layer.attrib["rank"] == "1":
                for label in layer.findall('fn:label', ns):
                    startend = process_xml_labels(label, layertype)
                    if startend is None:
                        if "itype" in label.attrib:
                            # label without a span but with an itype: log and move on
                            logger.write("\t\tIssue: itype = " + label.attrib["itype"] + "\n")
                            continue
                        else:
                            break
                    fsp.add_fe(startend, label.attrib["name"], logger)

        if not fsp.foundtarget:
            logger.write("\t\tSkipping: missing target\n")
            continue
        if not fsp.foundfes:
            logger.write("\t\tIssue: missing FSP annotations\n")
        if fsp not in fspset:
            fspno += 1
            fsps[anno_id] = fsp
            fspset.add(fsp)
        else:
            logger.write("\t\tRepeated frames encountered for same sentence\n")

    return numannosets, fspno, fsps


def get_annoids(filelist, outf, outsentf):
    """
    Converts the given full-text annotation files to CoNLL.

    Every file name in ``filelist`` is looked up under FULLTEXT_DIR; names that are
    not regular files are skipped. Sentence text goes to ``outsentf`` and the FSPs
    to ``outf``; summary statistics are written to stderr.

    :return: list of IDs of all annotation sets that carry both a "luName" and a
             "frameName" attribute
    """
    annos = []
    isfirstex = True
    fspno = 0
    numsents = 0
    invalidsents = 0
    repeated = 0
    totfsps = 0
    sents = set([])
    isfirstsentex = True

    for tfname in filelist:
        tfname = FULLTEXT_DIR + tfname
        logger.write("\n" + tfname + "\n")
        if not os.path.isfile(tfname):
            continue
        with codecs.open(tfname, 'rb', 'utf-8') as xml_file:
            tree = et.parse(xml_file)

        root = tree.getroot()
        for sentence in root.iter('{http://framenet.icsi.berkeley.edu}sentence'):
            numsents += 1
            logger.write("sentence:\t" + str(sentence.attrib["ID"]) + "\n")
            for annotation in sentence.iter('{http://framenet.icsi.berkeley.edu}annotationSet'):
                annotation_id = annotation.attrib["ID"]
                if annotation_id == "2019791" and VERSION == "1.5":
                    # Hack to skip an erroneous annotation of Cathedral as raise.v with frame "Growing_food".
                    continue
                if "luName" in annotation.attrib and "frameName" in annotation.attrib:
                    annos.append(annotation_id)

            # get the tokenization and pos tags for a sentence
            sentann = process_sent(sentence, outsentf, isfirstsentex)
            isfirstsentex = False
            if sentann is None:
                invalidsents += 1
                logger.write("\t\tIssue: Token-level annotations not found\n")
                continue

            # get all the different FSP annotations in the sentence
            _, fspno, fsps = get_all_fsps_in_sent(sentence, sentann, fspno, None, None, True, outf)
            totfsps += len(fsps)
            if not fsps:
                invalidsents += 1
            if sentann.text in sents:
                repeated += 1
            for fsp in fsps.values():
                sents.add(sentann.text)
                write_to_conll(outf, fsp, isfirstex, numsents)
                sizes[outf] += 1
                isfirstex = False

    sys.stderr.write("# total sents processed = %d\n" %numsents)
    sys.stderr.write("# repeated sents = %d\n" %repeated)
    sys.stderr.write("# invalid sents = %d\n" %invalidsents)
    sys.stderr.write("# sents in set = %d\n" %len(sents))
    sys.stderr.write("# annotations = %d\n" %totfsps)
    return annos


def process_fulltext():
    """
    Converts the full-text data into the test, dev and full-text train CoNLL files.

    :return: ``(dev_annos, test_annos)`` - the annotation set IDs found in the dev
             and test files
    """
    sys.stderr.write("\nReading {} fulltext data ...\n".format(VERSION))

    # read and write all the test examples in conll
    logger.write("\n\nTEST\n\n")
    sys.stderr.write("TEST\n")
    test_annos = get_annoids(TEST_FILES, testf, testsentf)

    # read and write all the dev examples in conll
    logger.write("\n\nDEV\n\n")
    sys.stderr.write("DEV\n")
    dev_annos = get_annoids(DEV_FILES, devf, devsentf)

    # read all the full-text train examples in conll
    # os.listdir returns entries in arbitrary order; sort so that the order of
    # the training output does not depend on the filesystem
    train_fte_files = [f for f in sorted(os.listdir(FULLTEXT_DIR))
                       if f not in TEST_FILES
                       and f not in DEV_FILES
                       and not f.endswith("xsl")]
    logger.write("\n\nFULLTEXT TRAIN\n\n")
    sys.stderr.write("FULLTEXT TRAIN\n")
    get_annoids(train_fte_files, ftetrainf, ftetrainsentf)

    return dev_annos, test_annos


def process_lu_xml(lufname, dev_annos, test_annos):
    """
    Converts the exemplar sentences of one lexical-unit XML file to CoNLL.

    FSPs whose annotation set ID occurs in ``dev_annos`` or ``test_annos`` are not
    written. Updates the module-level exemplar statistics and first-write flags.

    :param lufname: path of the LU XML file
    :param dev_annos: collection of annotation set IDs used in the dev data
    :param test_annos: collection of annotation set IDs used in the test data
    """
    global totsents, numsentsreused, fspno, numlus, isfirst, isfirstsent
    with codecs.open(lufname, 'rb', 'utf-8') as xml_file:
        tree = et.parse(xml_file)
    root = tree.getroot()

    frame = root.attrib["frame"]
    lex_unit = root.attrib["name"]
    logger.write("\n" + lufname + "\t" + frame + "\t" + lex_unit + "\n")

    sentno = 0
    for sent in root.iter('{http://framenet.icsi.berkeley.edu}sentence'):
        sentno += 1
        # get the tokenization and pos tags for a sentence
        sent_id = sent.attrib["ID"]
        logger.write("sentence:\t" + str(sent_id) + "\n")

        sentann = process_sent(sent, trainsentf, isfirstsent)
        isfirstsent = False

        if sentann is None:
            logger.write("\t\tIssue: Token-level annotations not found\n")
            continue

        # get all the different FSP annotations in the sentence
        numannosets, fspno, fsps = get_all_fsps_in_sent(sent, sentann, fspno, lex_unit, frame, False, "exemplartrain")
        for anno_id in fsps:
            if anno_id in test_annos or anno_id in dev_annos:
                continue
            write_to_conll(trainf, fsps[anno_id], isfirst, sentno)
            sizes[trainf] += 1
            isfirst = False
        if numannosets > 2:
            numsentsreused += (numannosets - 2)

    numlus += 1
    logger.write(lufname + ": total sents = " + str(sentno) + "\n")
    totsents += sentno


def process_exemplars(dev_annos, test_annos):
    """
    Converts all lexical-unit files under LU_DIR to the exemplar train CoNLL file
    and reports summary statistics on stderr.

    :param dev_annos: iterable of annotation set IDs used in the dev data
    :param test_annos: iterable of annotation set IDs used in the test data
    """
    global totsents, numsentsreused, fspno, numlus, isfirst

    # membership is tested once per exemplar annotation: use sets, not lists
    dev_annos = frozenset(dev_annos)
    test_annos = frozenset(test_annos)

    # get the names of all LU xml files
    all_lus = []
    for f in os.listdir(LU_DIR):
        luf = os.path.join(LU_DIR, f)
        if luf.endswith("xsl"):
            continue
        all_lus.append(luf)
    sys.stderr.write("\nReading exemplar data from " + str(len(all_lus)) + " LU files...\n")

    logger.write("\n\nTRAIN EXEMPLAR\n\n")

    for i, luname in enumerate(sorted(all_lus), 1):
        if i % 1000 == 0:
            sys.stderr.write(str(i) + "...")
        if not os.path.isfile(luname):
            logger.write("\t\tIssue: Couldn't find " + luname + " - strange, terminating!\n")
            break
        process_lu_xml(luname, dev_annos, test_annos)

    # numlus stays 0.0 when no LU file was processed; avoid dividing by zero
    avg_fsps_per_lu = fspno / numlus if numlus else 0.0

    sys.stderr.write("\n\n# total LU sents = " + str(totsents) + "\n")
    sys.stderr.write("# total LU FSPs = " + str(fspno) + "\n")
    sys.stderr.write("# total LU files = " + str(numlus) + "\n")
    sys.stderr.write("average # FSPs per LU = " + str(avg_fsps_per_lu) + "\n")
    sys.stderr.write("# LU sents reused for multiple annotations = " + str(numsentsreused) + "\n")

    sys.stderr.write("\noutput file sizes:\n")
    for s in sizes:
        sys.stderr.write(s + ":\t" + str(sizes[s]) + "\n")
    sys.stderr.write("\n")


def filter_embeddings(embedding_files):
    """
    Filters the embeddings file to retain only the vocabulary in the train, dev and test files.

    :param embedding_files: names of embedding files, relative to DATA_DIR; each
                            line is expected to start with the word, followed by a space
    """
    sys.stderr.write("\nReading FrameNet {} vocabulary...\n".format(VERSION))
    vocab = set([])
    corpora = [DEV_CONLL, TRAIN_FTE, TRAIN_EXEMPLAR, TEST_CONLL]
    for corpus in corpora:
        with codecs.open(corpus, "r", "utf-8") as cf:
            # column 1 of every non-blank CoNLL line is the token (FORM)
            tokens = [line.split("\t")[1].lower() for line in cf if line != "\n"]
        vocab.update(tokens)
    sys.stderr.write("\nTotal (train + dev + test) vocabulary size = {}\nFiltering out the word vectors ...".format(len(vocab)))

    for emb_file in embedding_files:
        # NOTE(review): the output path does not depend on emb_file, so several
        # embedding files would overwrite the same output - confirm the intended
        # name before changing it, other code may read this exact path.
        new_embeddings_file = DATA_DIR.split(".txt")[0] + VERSION + ".framevocab.txt"
        num_embeddings = 0
        with open(DATA_DIR + emb_file, 'r') as embeddings_file:
            with open(new_embeddings_file, 'w') as filtered_embeddings:
                for l in embeddings_file:
                    fields = l.strip().split(' ')
                    wd = fields[0].lower()
                    if wd in vocab:
                        filtered_embeddings.write(l)
                        num_embeddings += 1
        sys.stderr.write("\nTotal embeddings in {} = {}\n".format(new_embeddings_file, num_embeddings))


if __name__ == "__main__":
    if not os.path.exists(PARSER_DATA_DIR):
        os.makedirs(PARSER_DATA_DIR)

    dev, test = process_fulltext()
    if options.exemplar:
        process_exemplars(dev, test)
    if options.filter_embeddings:
        filter_embeddings([EMBEDDINGS_FILE])
    logger.close()