Python argparse.open() Examples

The following code examples show how to use argparse.open(). They are taken from open source Python projects.
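Most of the examples below follow the same basic pattern: file paths (or argparse.FileType arguments) are parsed from the command line, and the files are then opened with codecs.open so they are read and written as UTF-8. The sketch below is a minimal, hypothetical illustration of that pattern rather than code from any of the listed projects; the argument names and the uppercase_file helper are invented for illustration.

import argparse
import codecs

def uppercase_file(input_path, output_path):
    # Read the input as UTF-8 and write an upper-cased copy, mirroring the
    # codecs.open calls used throughout the examples below.
    with codecs.open(input_path, encoding='utf-8') as fin, \
         codecs.open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            fout.write(line.upper())

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Upper-case a UTF-8 text file')
    parser.add_argument('--input', required=True, help='path to the input file')
    parser.add_argument('--output', required=True, help='path to the output file')
    args = parser.parse_args()
    uppercase_file(args.input, args.output)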

Example 1
Project: zubr_public   Author: yakazimir   File: learn_bpe.py    GNU General Public License v2.0
def from_data(config):
    """Learn BPE from static data in a the pipeline

    :param config: the global config for a pipeline run 
    """
    
    ## generic parser
    parser = create_parser()
    #args = parser.parse_args()
    args = parser.parse_args([])
    
    ## training data
    
    # source side
    args.input = codecs.open(config.atraining+".e",encoding='utf-8')
    args.output = codecs.open(os.path.join(config.dir,"codes.txt"),'w',encoding='utf-8')
    # learn codes on the source side and write them to codes.txt
    main(args.input,args.output,config.num_symbols,args.min_frequency,False,is_dict=args.dict_input)

    ## target (sem) side
    args.input = codecs.open(config.atraining+".f",encoding='utf-8')
    args.output = codecs.open(os.path.join(config.dir,"sem_codes.txt"),'w',encoding='utf-8')
    main(args.input,args.output,config.num_symbols,args.min_frequency,False,is_dict=args.dict_input) 
Example 2
Project: gec-pseudodata   Author: butsugiri   File: generate_pseudo_samples.py    MIT License
def read_unigram_freq(path_to_unigram_freq):
    index2word = {}
    word_index_list = []
    with open(path_to_unigram_freq, 'r') as fi:
        for n, line in enumerate(fi):
            token, freq = line.strip().split('\t')
            index2word[n] = token
            word_index_list += [n] * int(freq)
    return index2word, word_index_list 
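Because read_unigram_freq appends each word's index to word_index_list once per unit of frequency, a uniform random choice over that list draws tokens in proportion to their unigram frequency. The snippet below is a minimal usage sketch under that assumption; the file name unigram_freq.tsv and the sampling loop are illustrative and not taken from the project.

import random

# The frequency file is assumed to contain one "token<TAB>count" pair per line.
index2word, word_index_list = read_unigram_freq('unigram_freq.tsv')

# A uniform choice over word_index_list returns each index with probability
# proportional to its count, so this draws five tokens frequency-proportionally.
samples = [index2word[random.choice(word_index_list)] for _ in range(5)]
print(samples)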
Example 3
Project: NJUNMT-pytorch   Author: whr94621   File: bpe.py    MIT License
def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):

        with codecs.open(codes, encoding="utf-8") as codes:

            # check version information
            firstline = codes.readline()
            if firstline.startswith('#version:'):
                self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")])
            else:
                self.version = (0, 1)
                codes.seek(0)

            self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]

        # some hacking to deal with duplicates (only consider first instance)
        self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        self.cache = {} 
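Once constructed, a BPE object like this is normally used through its segment method, exactly as Example 7 below does with bpe.segment(line). The following is a minimal, hypothetical usage sketch; the codes file path is illustrative and is assumed to be a merge file produced by learn_bpe.

# Load merge operations from a codes file and split a sentence into
# '@@'-separated subword units (Example 7 uses the same call inside a pipeline).
bpe = BPE('codes.txt', separator='@@')
print(bpe.segment('an example sentence'))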
Example 4
Project: CAMIT   Author: wengrx   File: bpe.py    MIT License
def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):

        with codecs.open(codes, encoding="utf-8") as codes:

            # check version information
            firstline = codes.readline()
            if firstline.startswith('#version:'):
                self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")])
            else:
                self.version = (0, 1)
                codes.seek(0)

            self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]

        # some hacking to deal with duplicates (only consider first instance)
        self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        self.cache = {} 
Example 5
Project: zubr_public   Author: yakazimir   File: learn_bpe.py    GNU General Public License v2.0
def from_dictionary(dictionary,out_file,min_frequency=2,is_dict=False):
    num_symbols = 50000
    vocab = {tuple(("%s</w>" % ' '.join(w.lower())).split()):f for w,f in dictionary.items()}
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    threshold = max(stats.values()) / 10
    seg_map = {}
    outfile = codecs.open(out_file,'w',encoding='utf-8')

    for i in range(num_symbols):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i/(i+10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            #sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)

    outfile.close()
Example 6
Project: mlconvgec2018   Author: nusnlp   File: apply_bpe.py    GNU General Public License v3.0
def __init__(self, codes, separator='@@'):            
        
        with codecs.open(codes.name, encoding='utf-8') as codes:
            self.bpe_codes = [tuple(item.split()) for item in codes]
         
        # some hacking to deal with duplicates (only consider first instance)
        self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

        self.separator = separator 
Example 7
Project: VistaMT   Author: isi-vista   File: learn_joint_bpe_and_vocab.py    Apache License 2.0
def learn_joint_bpe_and_vocab(args):

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

        train_file.seek(0)
        for line in train_file:
            tmpout.write(bpe.segment(line).strip())
            tmpout.write('\n')

        tmpout.close()
        tmpin = codecs.open(tmp.name, encoding='UTF-8')

        vocab = learn_bpe.get_vocabulary(tmpin)
        tmpin.close()
        os.remove(tmp.name)

        for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
        vocab_file.close() 
Example 8
Project: DiTaxa   Author: ehsanasgari   File: npe_efficient.py    Apache License 2.0
def train_npe(sentenses, outfile, num_symbols, frequency_file, min_frequency=2, verbose=False, is_dict=False):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.
    """
    outfile_name=outfile
    list_of_seg=[]
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')
    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        #print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write(
                'pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1],
                                                                       stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
        if not i % 100:
            FileUtility.save_list(outfile_name+'_temp',list_of_seg)


    f.close()
    outfile.close()
Example 9
Project: DiTaxa   Author: ehsanasgari   File: npe_efficient.py    Apache License 2.0
def train_resampling_npe(sentenses, outfile, num_symbols, frequency_file, min_frequency=2, verbose=False, is_dict=False, resample_size=10000, N=10):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.
    """
    outfile_name=outfile
    list_of_seg=[]
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')


    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        #print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write(
                'pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1],
                                                                       stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
        if not i % 100:
            FileUtility.save_list(outfile_name+'_temp',list_of_seg)


    f.close()
    outfile.close()