Python argparse.open() Examples
The following are 9 code examples of argparse.open(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module argparse, or try the search function.
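Strictly speaking, argparse does not document a module-level open() function; what these listings show is the built-in open() (and codecs.open()/io.open()) being called in scripts whose file arguments arrive through argparse, typically as argparse.FileType objects whose .name attribute is reopened with an explicit encoding. A minimal sketch of that pattern (the --codes argument and file name are hypothetical, and the file is assumed to exist):

import argparse
import codecs

parser = argparse.ArgumentParser()
# argparse opens the file itself while parsing
parser.add_argument('--codes', type=argparse.FileType('r'), required=True)
args = parser.parse_args(['--codes', 'codes.bpe'])

# reopen by name with an explicit encoding, as the examples below do
with codecs.open(args.codes.name, encoding='utf-8') as codes:
    first_line = codes.readline()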
Example #1
Source File: apply_bpe.py From subword-nmt with MIT License | 6 votes |
def _process_lines(bpe, filename, outfile, dropout, begin, end):
    if isinstance(outfile, str):
        fo = open(outfile, "w", encoding="utf-8")
    else:
        fo = outfile
    with open(filename, encoding="utf-8") as f:
        f.seek(begin)
        line = f.readline()
        while line:
            pos = f.tell()
            assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
            if end > 0 and pos > end:
                break
            fo.write(bpe.process_line(line, dropout))
            line = f.readline()
    if isinstance(outfile, str):
        fo.close()
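A minimal usage sketch (hypothetical paths; bpe is assumed to be an already constructed BPE instance). Passing begin=0 and end=0 disables the byte-range check, so the whole file is processed:

# encode corpus.txt into corpus.bpe in a single pass
_process_lines(bpe, 'corpus.txt', 'corpus.bpe', dropout=0, begin=0, end=0)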
Example #2
Source File: learn_bpe.py From subword-nmt with MIT License | 6 votes |
def _get_vocabulary(infile, outfile, begin, end):
    import pickle
    vocab = Counter()
    with open(infile, encoding="utf8") as f:
        f.seek(begin)
        line = f.readline()
        while line:
            pos = f.tell()
            assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
            if end > 0 and pos > end:
                break
            for word in line.strip('\r\n ').split(' '):
                if word:
                    vocab[word] += 1
            line = f.readline()
    with open(outfile, 'wb') as f:
        pickle.dump(vocab, f)
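A usage sketch (hypothetical paths): count the whole input, then read the pickled Counter back:

import pickle

# begin=0, end=0 covers the entire input file
_get_vocabulary('corpus.txt', 'vocab.pkl', 0, 0)
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)  # a collections.Counter of word frequencies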
Example #3
Source File: apply_bpe.py From crosentgec with GNU General Public License v3.0 | 5 votes |
def __init__(self, codes, separator='@@'):
    with codecs.open(codes.name, encoding='utf-8') as codes:
        self.bpe_codes = [tuple(item.split()) for item in codes]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))])

    self.separator = separator
Example #4
Source File: apply_bpe.py From knmt with GNU General Public License v3.0 | 5 votes |
def __init__(self, codes, separator='__'):
    with io.open(codes.name, 'rt', encoding='utf-8') as codes:
        self.bpe_codes = [tuple(item.split()) for item in codes]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))])

    self.separator = separator
    self.cache = {}
Example #5
Source File: bpe.py From NJUNMT-pytorch with MIT License | 5 votes |
def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):
    with codecs.open(codes, encoding="utf-8") as codes:
        # check version information
        firstline = codes.readline()
        if firstline.startswith('#version:'):
            self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$', '', firstline.split()[-1]).split(".")])
        else:
            self.version = (0, 1)
            codes.seek(0)

        self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))])
    self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair, i in self.bpe_codes.items()])

    self.separator = separator
    self.vocab = vocab
    self.glossaries = glossaries if glossaries else []
    self.cache = {}
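Unlike the constructors in the previous examples, this one takes a codes file path rather than an open file object. A construction sketch (hypothetical path):

# load merge operations from a codes file produced by learn_bpe
bpe = BPE('codes.bpe', merges=-1, separator='@@')
print(bpe.version)         # e.g. (0, 2) when the file has a '#version:' header
print(len(bpe.bpe_codes))  # number of merge operations loaded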
Example #6
Source File: apply_bpe.py From mlconvgec2018 with GNU General Public License v3.0 | 5 votes |
def __init__(self, codes, separator='@@'):
    with codecs.open(codes.name, encoding='utf-8') as codes:
        self.bpe_codes = [tuple(item.split()) for item in codes]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))])

    self.separator = separator
Example #7
Source File: apply_bpe.py From subword-nmt with MIT License | 5 votes |
def process_lines(self, filename, outfile, dropout=0, num_workers=1):
    if sys.version_info < (3, 0):
        print("Parallel mode is only supported in Python3.")
        sys.exit(1)

    if num_workers == 1:
        _process_lines(self, filename, outfile, dropout, 0, 0)
    elif num_workers > 1:
        with open(filename, encoding="utf-8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = int(size / num_workers)
            offsets = [0 for _ in range(num_workers + 1)]
            for i in range(1, num_workers):
                f.seek(chunk_size * i)
                pos = f.tell()
                while True:
                    try:
                        line = f.readline()
                        break
                    except UnicodeDecodeError:
                        pos -= 1
                        f.seek(pos)
                offsets[i] = f.tell()
                assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"
        res_files = []
        pool = Pool(processes=num_workers)
        for i in range(num_workers):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            res_files.append(tmp)
            pool.apply_async(_process_lines, (self, filename, tmp.name, dropout, offsets[i], offsets[i + 1]))
        pool.close()
        pool.join()
        for i in range(num_workers):
            with open(res_files[i].name, encoding="utf-8") as fi:
                for line in fi:
                    outfile.write(line)
            os.remove(res_files[i].name)
    else:
        raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))
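A usage sketch (hypothetical paths; bpe is an assumed BPE instance). Note the asymmetry: filename is a path, while outfile must be an open, writable file object:

with open('corpus.bpe', 'w', encoding='utf-8') as out:
    # splits the input into byte ranges and encodes them in a process pool
    bpe.process_lines('corpus.txt', out, dropout=0, num_workers=4)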
Example #8
Source File: learn_joint_bpe_and_vocab.py From subword-nmt with MIT License | 4 votes |
def learn_joint_bpe_and_vocab(args):

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f, num_workers=args.num_workers)
        f.seek(0)

    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

        train_file.seek(0)
        bpe.process_lines(train_file.name, tmpout, num_workers=args.num_workers)

        tmpout.close()
        tmpin = codecs.open(tmp.name, encoding='UTF-8')

        vocab = learn_bpe.get_vocabulary(tmpin, num_workers=args.num_workers)
        tmpin.close()
        os.remove(tmp.name)

        for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))

        vocab_file.close()
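This function expects the Namespace built by the script's argparse parser, with input, vocab, and output as argparse.FileType objects (each is reopened via its .name attribute). A sketch of a matching parser (argument names are taken from the function body; the nargs choices and defaults are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input', type=argparse.FileType('r'), nargs='+', required=True)
parser.add_argument('--output', type=argparse.FileType('w'), required=True)
parser.add_argument('--vocab', type=argparse.FileType('w'), nargs='+', required=True)
parser.add_argument('--symbols', type=int, default=10000)
parser.add_argument('--min-frequency', type=int, default=2)  # -> args.min_frequency
parser.add_argument('--total-symbols', action='store_true')  # -> args.total_symbols
parser.add_argument('--separator', type=str, default='@@')
parser.add_argument('--num-workers', type=int, default=1)    # -> args.num_workers
parser.add_argument('--verbose', action='store_true')

# learn_joint_bpe_and_vocab(parser.parse_args())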
Example #9
Source File: learn_bpe.py From subword-nmt with MIT License | 4 votes |
def get_vocabulary(fobj, is_dict=False, num_workers=1):
    """Read text and return dictionary that encodes vocabulary"""
    vocab = Counter()
    if is_dict:
        for i, line in enumerate(fobj):
            try:
                word, count = line.strip('\r\n ').split(' ')
            except:
                print('Failed reading vocabulary file at line {0}: {1}'.format(i, line))
                sys.exit(1)
            vocab[word] += int(count)
    elif num_workers == 1 or fobj.name == '<stdin>':
        if num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for i, line in enumerate(fobj):
            for word in line.strip('\r\n ').split(' '):
                if word:
                    vocab[word] += 1
    elif num_workers > 1:
        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)

        with open(fobj.name, encoding="utf8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = int(size / num_workers)
            offsets = [0 for _ in range(num_workers + 1)]
            for i in range(1, num_workers):
                f.seek(chunk_size * i)
                pos = f.tell()
                while True:
                    try:
                        line = f.readline()
                        break
                    except UnicodeDecodeError:
                        pos -= 1
                        f.seek(pos)
                offsets[i] = f.tell()
                assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"

        vocab_files = []
        pool = Pool(processes=num_workers)
        for i in range(num_workers):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            vocab_files.append(tmp)
            pool.apply_async(_get_vocabulary, (fobj.name, tmp.name, offsets[i], offsets[i + 1]))
        pool.close()
        pool.join()
        import pickle
        for i in range(num_workers):
            with open(vocab_files[i].name, 'rb') as f:
                vocab += pickle.load(f)
            os.remove(vocab_files[i].name)
    else:
        raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))
    return vocab
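A usage sketch (hypothetical path). Because the parallel branch reopens the input by fobj.name, the input must be a real file rather than STDIN whenever num_workers > 1:

with open('corpus.txt', encoding='utf8') as f:
    vocab = get_vocabulary(f, num_workers=4)
print(vocab.most_common(10))  # ten most frequent words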