Python preprocess.build_save_dataset() Examples

The following are 8 code examples of preprocess.build_save_dataset(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module preprocess , or try the search function .
Example #1
Source File: test_preprocess.py    From ITDD with MIT License 6 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.inputters.get_fields("text", 0, 0)

        if hasattr(opt, 'src_vocab') and len(opt.src_vocab) > 0:
            with codecs.open(opt.src_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')
        if hasattr(opt, 'tgt_vocab') and len(opt.tgt_vocab) > 0:
            with codecs.open(opt.tgt_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')

        train_data_files = preprocess.build_save_dataset('train', fields, opt)

        preprocess.build_save_vocab(train_data_files, fields, opt)

        preprocess.build_save_dataset('valid', fields, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt)
        if hasattr(opt, 'src_vocab') and os.path.exists(opt.src_vocab):
            os.remove(opt.src_vocab)
        if hasattr(opt, 'tgt_vocab') and os.path.exists(opt.tgt_vocab):
            os.remove(opt.tgt_vocab) 
Example #2
Source File: test_preprocess.py    From data2text-entity-py with MIT License 6 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.io.get_fields("text", 0, 0)

        if hasattr(opt, 'src_vocab') and len(opt.src_vocab) > 0:
            with codecs.open(opt.src_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')
        if hasattr(opt, 'tgt_vocab') and len(opt.tgt_vocab) > 0:
            with codecs.open(opt.tgt_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')

        train_data_files = preprocess.build_save_dataset('train', fields, opt)

        preprocess.build_save_vocab(train_data_files, fields, opt)

        preprocess.build_save_dataset('valid', fields, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt)
        if hasattr(opt, 'src_vocab') and os.path.exists(opt.src_vocab):
            os.remove(opt.src_vocab)
        if hasattr(opt, 'tgt_vocab') and os.path.exists(opt.tgt_vocab):
            os.remove(opt.tgt_vocab) 
Example #3
Source File: test_preprocess.py    From var-attn with MIT License 6 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.io.get_fields("text", 0, 0)

        if hasattr(opt, 'src_vocab') and len(opt.src_vocab) > 0:
            with codecs.open(opt.src_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')
        if hasattr(opt, 'tgt_vocab') and len(opt.tgt_vocab) > 0:
            with codecs.open(opt.tgt_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')

        train_data_files = preprocess.build_save_dataset('train', fields, opt)

        preprocess.build_save_vocab(train_data_files, fields, opt)

        preprocess.build_save_dataset('valid', fields, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt)
        if hasattr(opt, 'src_vocab') and os.path.exists(opt.src_vocab):
            os.remove(opt.src_vocab)
        if hasattr(opt, 'tgt_vocab') and os.path.exists(opt.tgt_vocab):
            os.remove(opt.tgt_vocab) 
Example #4
Source File: test_preprocess.py    From OpenNMT-kpg-release with MIT License 6 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.inputters.get_fields("text", 0, 0)

        if hasattr(opt, 'src_vocab') and len(opt.src_vocab) > 0:
            with codecs.open(opt.src_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')
        if hasattr(opt, 'tgt_vocab') and len(opt.tgt_vocab) > 0:
            with codecs.open(opt.tgt_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')

        src_reader = onmt.inputters.str2reader[opt.data_type].from_opt(opt)
        tgt_reader = onmt.inputters.str2reader["text"].from_opt(opt)
        preprocess.build_save_dataset(
            'train', fields, src_reader, tgt_reader, opt)

        preprocess.build_save_dataset(
            'valid', fields, src_reader, tgt_reader, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt)
        if hasattr(opt, 'src_vocab') and os.path.exists(opt.src_vocab):
            os.remove(opt.src_vocab)
        if hasattr(opt, 'tgt_vocab') and os.path.exists(opt.tgt_vocab):
            os.remove(opt.tgt_vocab) 
Example #5
Source File: test_preprocess.py    From BiSET with MIT License 6 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.inputters.get_fields("text", 0, 0)

        if hasattr(opt, 'src_vocab') and len(opt.src_vocab) > 0:
            with codecs.open(opt.src_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')
        if hasattr(opt, 'tgt_vocab') and len(opt.tgt_vocab) > 0:
            with codecs.open(opt.tgt_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')

        train_data_files = preprocess.build_save_dataset('train', fields, opt)

        preprocess.build_save_vocab(train_data_files, fields, opt)

        preprocess.build_save_dataset('valid', fields, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt)
        if hasattr(opt, 'src_vocab') and os.path.exists(opt.src_vocab):
            os.remove(opt.src_vocab)
        if hasattr(opt, 'tgt_vocab') and os.path.exists(opt.tgt_vocab):
            os.remove(opt.tgt_vocab) 
Example #6
Source File: test_preprocess.py    From DC-NeuralConversation with MIT License 5 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.io.get_fields("text", 0, 0)

        train_data_files = preprocess.build_save_dataset('train', fields, opt)

        preprocess.build_save_vocab(train_data_files, fields, opt)

        preprocess.build_save_dataset('valid', fields, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt) 
Example #7
Source File: test_preprocess.py    From encoder-agnostic-adaptation with MIT License 5 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.inputters.get_fields("text", 0, 0)

        if hasattr(opt, 'src_vocab') and len(opt.src_vocab) > 0:
            with codecs.open(opt.src_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')
        if hasattr(opt, 'tgt_vocab') and len(opt.tgt_vocab) > 0:
            with codecs.open(opt.tgt_vocab, 'w', 'utf-8') as f:
                f.write('a\nb\nc\nd\ne\nf\n')

        src_reader = onmt.inputters.str2reader[opt.data_type].from_opt(opt)
        tgt_reader = onmt.inputters.str2reader["text"].from_opt(opt)
        train_data_files = preprocess.build_save_dataset(
            'train', fields, src_reader, tgt_reader, opt)

        preprocess.build_save_vocab(train_data_files, fields, opt)

        preprocess.build_save_dataset(
            'valid', fields, src_reader, tgt_reader, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt)
        if hasattr(opt, 'src_vocab') and os.path.exists(opt.src_vocab):
            os.remove(opt.src_vocab)
        if hasattr(opt, 'tgt_vocab') and os.path.exists(opt.tgt_vocab):
            os.remove(opt.tgt_vocab) 
Example #8
Source File: test_preprocess.py    From graph-2-text with MIT License 5 votes vote down vote up
def dataset_build(self, opt):
        fields = onmt.io.get_fields("text", 0, 0)

        train_data_files = preprocess.build_save_dataset('train', fields, opt)

        preprocess.build_save_vocab(train_data_files, fields, opt)

        preprocess.build_save_dataset('valid', fields, opt)

        # Remove the generated *pt files.
        for pt in glob.glob(SAVE_DATA_PREFIX + '*.pt'):
            os.remove(pt)