""" A module with a handful of utility functions for dealing with file I/O """ import codecs import sys import io import os import fileinput COLCOUNT=10 ID,FORM,LEMMA,CPOSTAG,POSTAG,FEATS,HEAD,DEPREL,DEPS,MISC=range(COLCOUNT) COLNAMES=u"ID,FORM,LEMMA,CPOSTAG,POSTAG,FEATS,HEAD,DEPREL,DEPS,MISC".split(u",") def in_out(args,multiple_files=False): """Open the input/output data streams. If multiple_files is set to True, returns an iterator over lines. If set to False, returns an open file. This distinction is needed because validator.py checks the newlines property and needs to get the input as a file, but the other scripts just need the lines so they can work with several files. """ #Decide where to get the data from if args.input is None or args.input=="-": #Stdin inp=codecs.getreader("utf-8")(os.fdopen(0,"U")) #Switched universal newlines on else: #File name given if multiple_files: inp_raw=fileinput.input(files=args.input,mode="U") inp=(line.decode("utf-8") for line in inp_raw) else: inp_raw=open(args.input,mode="U") inp=codecs.getreader("utf-8")(inp_raw) #inp is now an iterator over lines, giving unicode strings if args.output is None or args.output=="-": #stdout out=codecs.getwriter("utf-8")(sys.stdout) else: #File name given out=codecs.open(args.output,"w","utf-8") return inp,out def print_tree(comments,tree,out): if comments: print >> out, u"\n".join(comments) for cols in tree: print >> out, u"\t".join(cols) print >> out def trees(inp): """ `inp` a file-like object yielding lines as unicode Yields the input a tree at a time. """ comments=[] #List of comment lines to go with the current tree lines=[] #List of token/word lines of the current tree for line_counter, line in enumerate(inp): line=line.rstrip() if not line: #empty line if lines: #Sentence done, yield. Skip otherwise. yield comments, lines comments=[] lines=[] elif line[0]==u"#": comments.append(line) elif line[0].isdigit(): cols=line.split(u"\t") if len(cols)!=COLCOUNT: print >> sys.stderr, u"Line %d: The line has %d columns, but %d are expected. Giving up."%(line_counter+1,len(cols),COLCOUNT) sys.exit(1) lines.append(cols) else: #A line which is not a comment, nor a token/word, nor empty. That's bad! #TODO warn! print >> sys.stderr, u"Line %d not conllu: Giving up."%(line_counter+1) sys.exit(1) #Give a non-zero exit code else: #end of file if comments or lines: #Looks like a forgotten empty line at the end of the file, well, okay... yield comments, lines