Python argparse.open() Examples

The following code examples show how to use argparse.open(). They are taken from open source Python projects.
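Most of the examples below follow the same basic pattern: file paths (or argparse.FileType arguments) are parsed from the command line, and the files are then opened with codecs.open so they are read and written as UTF-8. The sketch below is a minimal, hypothetical illustration of that pattern rather than code from any of the listed projects; the argument names and the uppercase_file helper are invented for illustration.

import argparse
import codecs

def uppercase_file(input_path, output_path):
    # Read the input as UTF-8 and write an upper-cased copy, mirroring the
    # codecs.open calls used throughout the examples below.
    with codecs.open(input_path, encoding='utf-8') as fin, \
         codecs.open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            fout.write(line.upper())

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Upper-case a UTF-8 text file')
    parser.add_argument('--input', required=True, help='path to the input file')
    parser.add_argument('--output', required=True, help='path to the output file')
    args = parser.parse_args()
    uppercase_file(args.input, args.output)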

Example 1
Project: zubr_public   Author: yakazimir   File: learn_bpe.py    GNU General Public License v2.0
def from_data(config):
    """Learn BPE from static data in a the pipeline

    :param config: the global config for a pipeline run 
    """
    
    ## generic parser
    parser = create_parser()
    #args = parser.parse_args()
    args = parser.parse_args([])
    
    ## training data
    
    # source side
    args.input = codecs.open(config.atraining+".e",encoding='utf-8')
    args.output = codecs.open(os.path.join(config.dir,"codes.txt"),'w',encoding='utf-8')
    # learn codes on the source side and write them to codes.txt
    main(args.input,args.output,config.num_symbols,args.min_frequency,False,is_dict=args.dict_input)

    ## target (sem) side
    args.input = codecs.open(config.atraining+".f",encoding='utf-8')
    args.output = codecs.open(os.path.join(config.dir,"sem_codes.txt"),'w',encoding='utf-8')
    main(args.input,args.output,config.num_symbols,args.min_frequency,False,is_dict=args.dict_input) 
Example 2
Project: gec-pseudodata   Author: butsugiri   File: generate_pseudo_samples.py    MIT License
def read_unigram_freq(path_to_unigram_freq):
    index2word = {}
    word_index_list = []
    with open(path_to_unigram_freq, 'r') as fi:
        for n, line in enumerate(fi):
            token, freq = line.strip().split('\t')
            index2word[n] = token
            word_index_list += [n] * int(freq)
    return index2word, word_index_list 
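Because read_unigram_freq appends each word's index to word_index_list once per unit of frequency, a uniform random choice over that list draws tokens in proportion to their unigram frequency. The snippet below is a minimal usage sketch under that assumption; the file name unigram_freq.tsv and the sampling loop are illustrative and not taken from the project.

import random

# The frequency file is assumed to contain one "token<TAB>count" pair per line.
index2word, word_index_list = read_unigram_freq('unigram_freq.tsv')

# A uniform choice over word_index_list returns each index with probability
# proportional to its count, so this draws five tokens frequency-proportionally.
samples = [index2word[random.choice(word_index_list)] for _ in range(5)]
print(samples)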
Example 3
Project: NJUNMT-pytorch   Author: whr94621   File: bpe.py    MIT License
def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):

        with codecs.open(codes, encoding="utf-8") as codes:

            # check version information
            firstline = codes.readline()
            if firstline.startswith('#version:'):
                self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")])
            else:
                self.version = (0, 1)
                codes.seek(0)

            self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]

        # some hacking to deal with duplicates (only consider first instance)
        self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        self.cache = {} 
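Once constructed, a BPE object like this is normally used through its segment method, exactly as Example 7 below does with bpe.segment(line). The following is a minimal, hypothetical usage sketch; the codes file path is illustrative and is assumed to be a merge file produced by learn_bpe.

# Load merge operations from a codes file and split a sentence into
# '@@'-separated subword units (Example 7 uses the same call inside a pipeline).
bpe = BPE('codes.txt', separator='@@')
print(bpe.segment('an example sentence'))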
Example 4
Project: CAMIT   Author: wengrx   File: bpe.py    MIT License
def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None):

        with codecs.open(codes, encoding="utf-8") as codes:

            # check version information
            firstline = codes.readline()
            if firstline.startswith('#version:'):
                self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")])
            else:
                self.version = (0, 1)
                codes.seek(0)

            self.bpe_codes = [tuple(item.split()) for (n, item) in enumerate(codes) if (n < merges or merges == -1)]

        # some hacking to deal with duplicates (only consider first instance)
        self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        self.glossaries = glossaries if glossaries else []

        self.cache = {} 
Example 5
Project: zubr_public   Author: yakazimir   File: learn_bpe.py    GNU General Public License v2.0
def from_dictionary(dictionary,out_file,min_frequency=2,is_dict=False):
    num_symbols = 50000
    vocab = {tuple(("%s</w>" % ' '.join(w.lower())).split()):f for w,f in dictionary.items()}
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    threshold = max(stats.values()) / 10
    seg_map = {}
    outfile = codecs.open(out_file,'w',encoding='utf-8')

    for i in range(num_symbols):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i/(i+10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            #sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)

    outfile.close()
Example 6
Project: mlconvgec2018   Author: nusnlp   File: apply_bpe.py    GNU General Public License v3.0
def __init__(self, codes, separator='@@'):            
        
        with codecs.open(codes.name, encoding='utf-8') as codes:
            self.bpe_codes = [tuple(item.split()) for item in codes]
         
        # some hacking to deal with duplicates (only consider first instance)
        self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))])

        self.separator = separator 
Example 7
Project: VistaMT   Author: isi-vista   File: learn_joint_bpe_and_vocab.py    Apache License 2.0
def learn_joint_bpe_and_vocab(args):

    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write('Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab]

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

        train_file.seek(0)
        for line in train_file:
            tmpout.write(bpe.segment(line).strip())
            tmpout.write('\n')

        tmpout.close()
        tmpin = codecs.open(tmp.name, encoding='UTF-8')

        vocab = learn_bpe.get_vocabulary(tmpin)
        tmpin.close()
        os.remove(tmp.name)

        for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
        vocab_file.close() 
Example 8
Project: DiTaxa   Author: ehsanasgari   File: npe_efficient.py    Apache License 2.0
def train_npe(sentenses, outfile, num_symbols, frequency_file, min_frequency=2, verbose=False, is_dict=False):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.
    """
    outfile_name=outfile
    list_of_seg=[]
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')
    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        #print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write(
                'pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1],
                                                                       stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
        if not i % 100:
            FileUtility.save_list(outfile_name+'_temp',list_of_seg)


    f.close()
    outfile.close()
Example 9
Project: DiTaxa   Author: ehsanasgari   File: npe_efficient.py    Apache License 2.0
def train_resampling_npe(sentenses, outfile, num_symbols, frequency_file, min_frequency=2, verbose=False, is_dict=False, resample_size=10000, N=10):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.
    """
    outfile_name=outfile
    list_of_seg=[]
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')


    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        #print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write(
                'pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1],
                                                                       stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
        if not i % 100:
            FileUtility.save_list(outfile_name+'_temp',list_of_seg)


    f.close()
    outfile.close()