Python argparse.open() Examples

The following are 9 code examples showing how to use argparse.open(). They are extracted from open source projects; you can go to the original project or source file by following the links above each example.


You may also want to check out all available functions and classes of the module argparse.
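
For context, the file arguments seen in these snippets are typically produced by an argparse command-line parser and then re-opened by name (e.g. codecs.open(codes.name, encoding='utf-8')). Below is a minimal, hypothetical sketch of such a parser; the option names --codes, --input, and --output and the echo loop are illustrative only and are not taken from any of the projects listed here.

import argparse

def build_parser():
    # Hypothetical parser: argparse.FileType arguments arrive as already-open
    # file objects, which is why several examples below re-open them via
    # the `.name` attribute.
    parser = argparse.ArgumentParser(description='Apply BPE codes to a text file')
    parser.add_argument('--codes', type=argparse.FileType('r'), required=True,
                        help='file with BPE merge operations')
    parser.add_argument('--input', type=argparse.FileType('r'), default='-',
                        help='input text (default: standard input)')
    parser.add_argument('--output', type=argparse.FileType('w'), default='-',
                        help='output text (default: standard output)')
    return parser

if __name__ == '__main__':
    args = build_parser().parse_args()
    # Echo the input to the output; a real script would build a BPE instance
    # from args.codes and call its processing methods on each line instead.
    for line in args.input:
        args.output.write(line)

The actual scripts in these projects define more options (such as the separator and dropout parameters seen in the examples), but the pattern of handing argparse.FileType results to functions like the ones below is the same.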

Example 1
Project: subword-nmt   Author: rsennrich   File: apply_bpe.py   License: MIT License   6 votes
def _process_lines(bpe, filename, outfile, dropout, begin, end):
    if isinstance(outfile, str):
        fo = open(outfile, "w", encoding="utf-8")
    else:
        fo = outfile
    with open(filename, encoding="utf-8") as f:
        f.seek(begin)
        line = f.readline()
        while line:
            pos = f.tell()
            assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
            if end > 0 and pos > end:
                break
            fo.write(bpe.process_line(line, dropout))
            line = f.readline()
    if isinstance(outfile, str):
        fo.close() 
Example 2
Project: subword-nmt   Author: rsennrich   File: learn_bpe.py   License: MIT License   6 votes
def _get_vocabulary(infile, outfile, begin, end):
    import pickle
    vocab = Counter()
    with open(infile, encoding="utf8") as f:
        f.seek(begin)
        line = f.readline()
        while line:
            pos = f.tell()
            assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'"
            if end > 0 and pos > end:
                break
            for word in line.strip('\r\n ').split(' '):
                if word:
                    vocab[word] += 1
            line = f.readline()
    with open(outfile, 'wb') as f:
        pickle.dump(vocab, f) 
Example 3
Project: crosentgec   Author: nusnlp   File: apply_bpe.py   License: GNU General Public License v3.0   5 votes
def __init__(self, codes, separator='@@'):

    with codecs.open(codes.name, encoding='utf-8') as codes:
        self.bpe_codes = [tuple(item.split()) for item in codes]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

    self.separator = separator
Example 4
Project: knmt   Author: fabiencro   File: apply_bpe.py   License: GNU General Public License v3.0   5 votes
def __init__(self, codes, separator='__'):

    with io.open(codes.name, 'rt', encoding='utf-8') as codes:
        self.bpe_codes = [tuple(item.split()) for item in codes]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code, i) for (i, code) in reversed(list(enumerate(self.bpe_codes)))])

    self.separator = separator
    self.cache = {}
Example 5
Project: NJUNMT-pytorch   Author: whr94621   File: bpe.py   License: MIT License   5 votes
def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):

    with codecs.open(codes, encoding="utf-8") as codes:

        # check version information
        firstline = codes.readline()
        if firstline.startswith('#version:'):
            self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")])
        else:
            self.version = (0, 1)
            codes.seek(0)

        self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

    self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()])

    self.separator = separator

    self.vocab = vocab

    self.glossaries = glossaries if glossaries else []

    self.cache = {}
Example 6
Project: mlconvgec2018   Author: nusnlp   File: apply_bpe.py   License: GNU General Public License v3.0   5 votes
def __init__(self, codes, separator='@@'):

    with codecs.open(codes.name, encoding='utf-8') as codes:
        self.bpe_codes = [tuple(item.split()) for item in codes]

    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

    self.separator = separator
Example 7
Project: subword-nmt   Author: rsennrich   File: apply_bpe.py   License: MIT License   5 votes
def process_lines(self, filename, outfile, dropout=0, num_workers=1):

    if sys.version_info < (3, 0):
        print("Parallel mode is only supported in Python3.")
        sys.exit(1)

    if num_workers == 1:
        _process_lines(self, filename, outfile, dropout, 0, 0)
    elif num_workers > 1:
        with open(filename, encoding="utf-8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = int(size / num_workers)
            offsets = [0 for _ in range(num_workers + 1)]
            for i in range(1, num_workers):
                f.seek(chunk_size * i)
                pos = f.tell()
                while True:
                    try:
                        line = f.readline()
                        break
                    except UnicodeDecodeError:
                        pos -= 1
                        f.seek(pos)
                offsets[i] = f.tell()
                assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"
        res_files = []
        pool = Pool(processes=num_workers)
        for i in range(num_workers):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            res_files.append(tmp)
            pool.apply_async(_process_lines, (self, filename, tmp.name, dropout, offsets[i], offsets[i + 1]))
        pool.close()
        pool.join()
        for i in range(num_workers):
            with open(res_files[i].name, encoding="utf-8") as fi:
                for line in fi:
                    outfile.write(line)
            os.remove(res_files[i].name)
    else:
        raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))
Example 8
Project: subword-nmt   Author: rsennrich   File: learn_joint_bpe_and_vocab.py   License: MIT License   4 votes
def learn_joint_bpe_and_vocab(args):

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f, num_workers=args.num_workers)
        f.seek(0)

    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

        train_file.seek(0)
        bpe.process_lines(train_file.name, tmpout, num_workers=args.num_workers)

        tmpout.close()
        tmpin = codecs.open(tmp.name, encoding='UTF-8')

        vocab = learn_bpe.get_vocabulary(tmpin, num_workers=args.num_workers)
        tmpin.close()
        os.remove(tmp.name)

        for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
        vocab_file.close() 
Example 9
Project: subword-nmt   Author: rsennrich   File: learn_bpe.py   License: MIT License   4 votes
def get_vocabulary(fobj, is_dict=False, num_workers=1):
    """Read text and return dictionary that encodes vocabulary
    """
    vocab = Counter()
    if is_dict:
        for i, line in enumerate(fobj):
            try:
                word, count = line.strip('\r\n ').split(' ')
            except:
                print('Failed reading vocabulary file at line {0}: {1}'.format(i, line))
                sys.exit(1)
            vocab[word] += int(count)
    elif num_workers == 1 or fobj.name == '<stdin>':
        if num_workers > 1:
            warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.")
        for i, line in enumerate(fobj):
            for word in line.strip('\r\n ').split(' '):
                if word:
                    vocab[word] += 1
    elif num_workers > 1:

        if sys.version_info < (3, 0):
            print("Parallel mode is only supported in Python3.")
            sys.exit(1)

        with open(fobj.name, encoding="utf8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = int(size / num_workers)
            offsets = [0 for _ in range(num_workers + 1)]
            for i in range(1, num_workers):
                f.seek(chunk_size * i)
                pos = f.tell()
                while True:
                    try:
                        line = f.readline()
                        break
                    except UnicodeDecodeError:
                        pos -= 1
                        f.seek(pos)
                offsets[i] = f.tell()
                assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'"

        vocab_files = []
        pool = Pool(processes=num_workers)
        for i in range(num_workers):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            vocab_files.append(tmp)
            pool.apply_async(_get_vocabulary, (fobj.name, tmp.name, offsets[i], offsets[i + 1]))
        pool.close()
        pool.join()
        import pickle
        for i in range(num_workers):
            with open(vocab_files[i].name, 'rb') as f:
                vocab += pickle.load(f)
            os.remove(vocab_files[i].name)
    else:
        raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers))
    return vocab