import sys import io import argparse ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC=range(10) def read_conll(inp,max_sent=0,drop_tokens=True,drop_nulls=True): comments=[] sent=[] yielded=0 for line in inp: line=line.strip() if line.startswith("#"): comments.append(line) elif not line: if sent: yield sent,comments yielded+=1 if max_sent>0 and yielded==max_sent: break sent,comments=[],[] else: cols=line.split("\t") if drop_tokens and "-" in cols[ID]: continue if drop_nulls and "." in cols[ID]: continue sent.append(cols) else: if sent: yield sent,comments def launch(args,q_in,q_out): while True: jobid,txt=q_in.get() if jobid=="FINAL": q_out.put((jobid,txt)) return cache=io.StringIO() for sent,comments in read_conll(txt.split("\n"),drop_tokens=False,drop_nulls=False): if comments: print("\n".join(comments),file=cache) for cols in sent: for col in (LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS): cols[col]="_" print("\t".join(cols),file=cache) print(file=cache) q_out.put((jobid,cache.getvalue())) argparser = argparse.ArgumentParser(description='Wipes LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS conllu columns to make sure this is test-grade run, keeps tokens and null words')